money control代码提交

This commit is contained in:
Achilles
2024-01-03 22:27:54 +08:00
parent 2faa760ad5
commit 15b654e425

View File

@@ -29,6 +29,7 @@ import java.util.concurrent.ExecutionException;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/** /**
* @author gs * @author gs
@@ -91,9 +92,18 @@ public class MoneyScraper {
} }
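/**
* Handles one listing URL that carries a letter group (for example A or B).
* Loads every MoneyStock already stored in the repository, fetches the page at
* the URL, and hands both to extractExchangeDetails.
* @param url address of the letter-group listing page to fetch
* @param httpClient HTTP client supplied by the caller (NOTE(review): not used in the visible body; confirm it is still needed)
* @param letter letter group the URL belongs to; echoed in the returned status message
* @return a list holding a single status message that names the current thread and the processed letter
* @throws IOException declared for callers (NOTE(review): the visible body does not show where it is raised; confirm)
*/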
private List<String> sendHttpRequest(String url, HttpClient httpClient, String letter) throws IOException { private List<String> sendHttpRequest(String url, HttpClient httpClient, String letter) throws IOException {
List<MoneyStock> allMoneyStock = moneyStockRepository.findAll();
Document document = fetchStockDetails(url); Document document = fetchStockDetails(url);
extractExchangeDetails(document); extractExchangeDetails(document,allMoneyStock);
List<String> result = new ArrayList<>(); List<String> result = new ArrayList<>();
result.add("Thread " + Thread.currentThread().getName() + " processed letters: " + letter); result.add("Thread " + Thread.currentThread().getName() + " processed letters: " + letter);
return result; return result;
@@ -137,6 +147,11 @@ public class MoneyScraper {
} }
/**
* Fetches the document for the given URL by delegating to {@code fetchDocumentWithRetry}.
* Per the original note, this is used to obtain the "url_self" pages of all stocks
* (NOTE(review): confirm against callers).
* @param url address of the page to fetch
* @return whatever {@code fetchDocumentWithRetry(url)} returns
*/
public static Document fetchStockDetails(String url) { public static Document fetchStockDetails(String url) {
return fetchDocumentWithRetry(url); return fetchDocumentWithRetry(url);
} }
@@ -173,16 +188,19 @@ public class MoneyScraper {
} }
public void extractExchangeDetails(Document soup) { public void extractExchangeDetails(Document soup,List<MoneyStock> moneyStockHadSavedList) {
Elements companies = soup.select("table.pcq_tbl.MT10"); Elements companies = soup.select("table.pcq_tbl.MT10");
List<String> hadSaveUrl = moneyStockHadSavedList.stream().map(MoneyStock::getSelfUrl).collect(Collectors.toList());
for (Element company : companies) { for (Element company : companies) {
Elements elements = company.select("tr > td > a"); Elements elements = company.select("tr > td > a");
for (Element element : elements) { for (Element element : elements) {
String textContent = element.text().trim(); String textContent = element.text().trim();
String linkAttribute = element.attr("href"); String linkAttribute = element.attr("href");
if(hadSaveUrl.contains(linkAttribute)){
log.error(Thread.currentThread().getName()+"已经存在了不需要重复保存,company_name: " + textContent + ", Link Attribute: " + linkAttribute);
continue;
}
log.info(Thread.currentThread().getName()+",Text Content: " + textContent + ", Link Attribute: " + linkAttribute); log.info(Thread.currentThread().getName()+",Text Content: " + textContent + ", Link Attribute: " + linkAttribute);
Document soup2 = fetchCompanyDetails(linkAttribute); Document soup2 = fetchCompanyDetails(linkAttribute);