bToday代码提交

This commit is contained in:
Achilles
2023-12-26 18:00:33 +08:00
parent d28e97d9ba
commit 7b693410ae
5 changed files with 361 additions and 0 deletions

View File

@@ -4,10 +4,13 @@ import java.lang.Integer;
import java.lang.String;
import java.util.Date;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.Table;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.Generated;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
import org.hibernate.annotations.DynamicInsert;
@@ -34,6 +37,9 @@ public class BtodayStockPO {
/**
* 主键 */
@Id
@GeneratedValue(
strategy = GenerationType.IDENTITY
)
Integer id;
/**

View File

@@ -0,0 +1,217 @@
package cn.stock.market.infrastructure.job;
import cn.stock.market.domain.basic.entity.BtodayStock;
import cn.stock.market.domain.basic.repository.BtodayStockRepository;
import cn.stock.market.infrastructure.db.repo.BtodayStockRepo;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
@Slf4j
@RestController
public class Scraper {
@Autowired
private BtodayStockRepository btodayStockRepo;
private final ExecutorService executorService = Executors.newFixedThreadPool(5);
@Scheduled(cron = "0 0 1 */2 * ?")
@RequestMapping("/testScraperGetBusinessToday")
public void schedule() {
String BASE_URL = "https://akm-img-a-in.tosshub.com/businesstoday/resource/market-widgets/prod/company-master-23-01-2023.json";
String company_name = "Bhagawati Oxygen Ltd";
try {
// 获取 JSON 数据
String json_data = scrapePage(BASE_URL);
// 解析 JSON 数据
if (json_data != null) {
List<BtodayStock> all = btodayStockRepo.findAll();
Map<String, String> sefUrlList = getSefUrl(json_data, company_name);
sefUrlList = sefUrlList.entrySet().stream()
.filter(entry -> all.stream().noneMatch(stock -> stock.getStockName().equals(entry.getKey())))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
// 将 Map 中的数据分成 5 个线程处理
int batchSize = sefUrlList.size() / 5; // 假设分成 5 个线程
int threadCount = 5;
CompletableFuture<Void>[] futures = new CompletableFuture[threadCount];
for (int i = 0; i < threadCount; i++) {
int startIndex = i * batchSize;
int endIndex = (i == threadCount - 1) ? sefUrlList.size() : (i + 1) * batchSize;
Map<String, String> subMap = sefUrlList.entrySet().stream()
.skip(startIndex)
.limit(endIndex - startIndex)
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
futures[i] = CompletableFuture.runAsync(() -> processSubMap(subMap), executorService);
}
// 等待所有异步任务完成
CompletableFuture.allOf(futures).get();
}
} catch (Exception e) {
log.error("IOException occurred while processing the JSON data", e);
}finally {
// 关闭线程池
executorService.shutdown();
}
}
private void processSubMap(Map<String, String> sefUrlList) {
for (Map.Entry<String, String> entry : sefUrlList.entrySet()) {
String companyName = entry.getKey();
String url = entry.getValue();
// 获取网页 HTML
String webHtml = null;
int maxRetries = 5;
int retryCount = 0;
while (retryCount < maxRetries) {
try {
webHtml = getWebsiteHtml(url);
break; // If successful, break out of the loop
} catch (java.net.SocketTimeoutException e) {
log.warn("Socket timeout exception occurred. Retrying... (" + (retryCount + 1) + "/" + maxRetries + ")");
retryCount++;
} catch (IOException e) {
log.warn("IOException occurred. Retrying... (" + (retryCount + 1) + "/" + maxRetries + ")");
retryCount++;
}
}
if (webHtml != null) {
// 获取公司代码
String coCode = getCompanyCode(webHtml);
if (coCode != null) {
// 获取股票市场列表
String[] stockMarketList = getStockMarket(webHtml);
for (String stockMarket : stockMarketList) {
// 获取网页详情
String detailUrl = buildWebDetailUrl(coCode, stockMarket);
// String webInfo = getWebDetail(detailUrl);
log.info("Stock detail coCode:{}, stockMarket:{}: ,detailUrl:{}", coCode, stockMarket,detailUrl);
BtodayStock btodayStock = new BtodayStock();
btodayStock.setStockName(companyName);
btodayStock.setCoCode(coCode);
btodayStock.setStockType(stockMarket);
btodayStock.setSelfUrl(url);
btodayStock.setUrl(detailUrl);
btodayStock.setLastUpdateTime(new Date());
btodayStockRepo.save(btodayStock);
/* if (webInfo != null) {
log.info("Stock detail for {} in {}: {}", coCode, stockMarket, webInfo);
log.info(webInfo);
} else {
log.warn("Failed to retrieve web detail information.");
}*/
}
} else {
log.warn("Failed to retrieve company code.");
}
} else {
log.warn("Failed to retrieve website HTML.");
}
}
}
private static String buildWebDetailUrl(String coCode, String stockMarket) {
return "https://marketapi.intoday.in/widget/stockdetail/pullview?co_code=" + coCode + "&exchange=" + stockMarket;
}
private static String scrapePage(String url) throws IOException {
log.info("Scraping " + url + "...");
return Jsoup.connect(url).ignoreContentType(true).execute().body();
}
private static Map<String, String> getSefUrl(String json_data, String companyName) {
Map<String, String> sefUrls = new HashMap<>();
// 在这里放入你的模糊匹配逻辑
JSONArray jsonArray = JSONArray.parseArray(json_data);
for (int i = 0; i < jsonArray.size(); i++) {
JSONObject item = jsonArray.getJSONObject(i);
// 在这里放入你的模糊匹配逻辑
String sef_url = item.getString("sef_url");
String company_name = item.getString("companyname");
if (company_name != null && sef_url != null/* && company_name.equals(companyName)*/) {
sefUrls.put(company_name, sef_url);
}
}
return sefUrls;
}
private static String getWebsiteHtml(String url) throws IOException {
log.info("Getting website URL: " + url + "...");
return Jsoup.connect(url).timeout(10000).get().html();
}
private static String getCompanyCode(String text) {
Document document = Jsoup.parse(text);
Element companyCodeInput = document.selectFirst("input[id=comapnyCodeId]");
if (companyCodeInput != null) {
return companyCodeInput.attr("value");
} else {
log.warn("No <input> with id=\"companyCodeId\" found on the website.");
return null;
}
}
private static String[] getStockMarket(String text) {
Document document = Jsoup.parse(text);
Elements ulElements = document.select("ul[class*=wdg_rhs_hdr_ul]");
List<String> stockMarketList = new ArrayList<>();
for (Element ulElement : ulElements) {
Elements liElements = ulElement.select("li");
for (Element liElement : liElements) {
Element spanElement = liElement.selectFirst("span[class=wdg_rhs_hdr_lnk]");
if (spanElement != null) {
stockMarketList.add(spanElement.text());
} else {
log.warn("Invalid status code while scraping.");
}
}
}
return stockMarketList.toArray(new String[0]);
}
private static String getWebDetail(String url) throws IOException {
log.info("Getting web detail URL: " + url + "...");
return Jsoup.connect(url).ignoreContentType(true).execute().body();
}
}