diff --git a/src/main/java/cn/stock/market/infrastructure/job/MoneyScraper.java b/src/main/java/cn/stock/market/infrastructure/job/MoneyScraper.java index 68cdeba..ef38799 100644 --- a/src/main/java/cn/stock/market/infrastructure/job/MoneyScraper.java +++ b/src/main/java/cn/stock/market/infrastructure/job/MoneyScraper.java @@ -29,6 +29,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; /** * @author gs @@ -91,9 +92,18 @@ public class MoneyScraper { } + /** + * Fetches the letter-grouped (A, B, ...) listing page at {@code url} and hands it, together with every stock returned by {@code moneyStockRepository.findAll()}, to {@code extractExchangeDetails}. + * @param url URL of the letter-grouped listing page to fetch + * @param httpClient not referenced by the statements visible in this method + * @param letter letter group label; only echoed in the returned status message + * @return a single-element list holding a status message with the current thread name and {@code letter} + * @throws IOException declared in the signature; none of the visible statements throws it directly + */ private List sendHttpRequest(String url, HttpClient httpClient, String letter) throws IOException { + List allMoneyStock = moneyStockRepository.findAll(); Document document = fetchStockDetails(url); - extractExchangeDetails(document); + extractExchangeDetails(document,allMoneyStock); List result = new ArrayList<>(); result.add("Thread " + Thread.currentThread().getName() + " processed letters: " + letter); return result; @@ -137,6 +147,11 @@ public class MoneyScraper { } + /** + * Fetches the page from which the self URLs of all stocks are read; delegates to {@code fetchDocumentWithRetry}. + * @param url URL of the page to fetch + * @return the {@code Document} returned by {@code fetchDocumentWithRetry(url)} + */ public static Document fetchStockDetails(String url) { return fetchDocumentWithRetry(url); } @@ -173,16 +188,19 @@ public class MoneyScraper { } - public void extractExchangeDetails(Document soup) { + public void extractExchangeDetails(Document soup,List moneyStockHadSavedList) { Elements companies = soup.select("table.pcq_tbl.MT10"); - + List hadSaveUrl = moneyStockHadSavedList.stream().map(MoneyStock::getSelfUrl).collect(Collectors.toList()); /* self URLs of the stocks passed in as already saved; links contained in this list are skipped in the loop below */ for (Element company : companies) { Elements elements = company.select("tr > td > a"); for (Element element : elements) { String textContent = element.text().trim(); String linkAttribute = element.attr("href"); - + if(hadSaveUrl.contains(linkAttribute)){ + 
log.error(Thread.currentThread().getName()+"已经存在了不需要重复保存,company_name: " + textContent + ", Link Attribute: " + linkAttribute); /* NOTE(review): this branch only skips a link whose href is already in hadSaveUrl, yet it logs at ERROR level - confirm that severity is intended */ + continue; + } log.info(Thread.currentThread().getName()+",Text Content: " + textContent + ", Link Attribute: " + linkAttribute); Document soup2 = fetchCompanyDetails(linkAttribute);