From 90a26823f0ba28318d1be89569868c9cbca44d07 Mon Sep 17 00:00:00 2001 From: vu-tran Date: Tue, 13 May 2025 10:10:42 +0700 Subject: [PATCH] update news --- .../domain/basic/service/StockService.java | 117 +++++++++++++----- .../stock/market/utils/HttpClientRequest.java | 50 ++++++++ src/main/resources/application-base-alpha.yml | 6 +- 3 files changed, 141 insertions(+), 32 deletions(-) diff --git a/src/main/java/cn/stock/market/domain/basic/service/StockService.java b/src/main/java/cn/stock/market/domain/basic/service/StockService.java index c21f982..b0ad5c3 100644 --- a/src/main/java/cn/stock/market/domain/basic/service/StockService.java +++ b/src/main/java/cn/stock/market/domain/basic/service/StockService.java @@ -255,47 +255,106 @@ public class StockService { // .header("Accept-Language", "en-US,en;q=0.9") // .get(); String url = "https://www.business-standard.com/markets/news"; - Document doc = Jsoup.connect(url) - .referrer("https://www.business-standard.com/") - .header("Accept-Language", "en-US,en;q=0.9") - .userAgent("Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1") - .timeout(5000) // timeout 5 seconds - .get(); - // result = doc.html().substring(doc.html().indexOf("
"),doc.html().lastIndexOf("
")+500); - Elements divElements = doc.select("div.listingstyle_cardlistlist__dfq57"); - StringBuilder sb = new StringBuilder(); - for (Element divElement : divElements) { - sb.append(divElement.outerHtml()).append("\n"); - } - result = sb.toString(); +// Document doc = Jsoup.connect(url) +// .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") +// .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") +// .header("Accept-Language", "en-US,en;q=0.5") +// .header("Connection", "keep-alive") +// .referrer("https://www.google.com") +// .ignoreHttpErrors(true) +// .timeout(10000) +// .get(); + String doGetNews = HttpClientRequest.doGetNews(url); +// result = doc.html().substring(doc.html().indexOf("
"),doc.html().lastIndexOf("
")+500); +// Elements divElements = doc.select("div.listingstyle_cardlistlist__dfq57"); +// StringBuilder sb = new StringBuilder(); +// for (Element divElement : divElements) { +// sb.append(divElement.outerHtml()).append("\n"); +// } +// result = sb.toString(); + result = extractNewsFromHtml(doGetNews); } catch (Exception e) { return e.toString(); } return result; } + private String extractNewsFromHtml(String rawHtml) { + Document doc = Jsoup.parse(rawHtml); + + // Optional: extract raw block around
+ int start = rawHtml.indexOf("
"); + int end = rawHtml.lastIndexOf("
") + 500; + String result = (start != -1 && end > start) ? rawHtml.substring(start, end) : ""; + + // Extract article list using CSS selector + Elements divElements = doc.select("div.listingstyle_cardlistlist__dfq57"); + StringBuilder sb = new StringBuilder(); + for (Element divElement : divElements) { + sb.append(divElement.outerHtml()).append("\n"); + } + + // If you only want the div content: + result = sb.toString(); + return result; + } + public List getNewsInfo(String url) { +// String result = ""; +// List list = new ArrayList<>(); +// try { +// // 使用Jsoup连接到网页 +//// Document doc = Jsoup.connect(url) +//// .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36") +//// .header("Referer", "https://www.business-standard.com/") +//// .header("Accept-Language", "en-US,en;q=0.9") +//// .get(); +// Document doc = Jsoup.connect(url) +// .referrer("https://www.business-standard.com/") +// .header("Accept-Language", "en-US,en;q=0.9") +// .userAgent("Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1") +// .timeout(5000) // timeout 5 seconds +// .get(); +// result = doc.html().substring(doc.html().indexOf("articleBody") + 14, doc.html().indexOf(",\"author\":") - 1); +// list.add(result); +// list.add(doc.html().substring(doc.html().indexOf("og:title") + 19, doc.html().indexOf(" list = new ArrayList<>(); try { - // 使用Jsoup连接到网页 -// Document doc = Jsoup.connect(url) -// .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36") -// .header("Referer", "https://www.business-standard.com/") -// .header("Accept-Language", "en-US,en;q=0.9") -// .get(); - Document doc = Jsoup.connect(url) - .referrer("https://www.business-standard.com/") - .header("Accept-Language", "en-US,en;q=0.9") - .userAgent("Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1") - .timeout(5000) // timeout 5 seconds - .get(); - result = doc.html().substring(doc.html().indexOf("articleBody") + 14, doc.html().indexOf(",\"author\":") - 1); + // Step 1: Get raw HTML via HttpClient + String rawHtml = HttpClientRequest.doGetNews(url); + + // Step 2: Parse HTML with Jsoup + Document doc = Jsoup.parse(rawHtml); + + // Step 3: Extract content manually (like original) + int startIdx = rawHtml.indexOf("articleBody") + 14; + int endIdx = rawHtml.indexOf(",\"author\":") - 1; + + if (startIdx > 0 && endIdx > startIdx) { + result = rawHtml.substring(startIdx, endIdx); + } else { + result = "[articleBody not found]"; + } list.add(result); - list.add(doc.html().substring(doc.html().indexOf("og:title") + 19, doc.html().indexOf(" 0 && titleEnd > titleStart) { + list.add(rawHtml.substring(titleStart, titleEnd)); + } else { + list.add("[title not found]"); + } + } catch (Exception e) { - list.add(e.toString()); - return list; + list.add("[Error] " + e.getMessage()); } return list; } diff --git a/src/main/java/cn/stock/market/utils/HttpClientRequest.java b/src/main/java/cn/stock/market/utils/HttpClientRequest.java index e6c7c9c..50498da 100644 --- a/src/main/java/cn/stock/market/utils/HttpClientRequest.java +++ b/src/main/java/cn/stock/market/utils/HttpClientRequest.java @@ -265,6 +265,56 @@ public class HttpClientRequest { return result; } + public static String doGetNews(String url) { + CloseableHttpClient httpClient = null; + CloseableHttpResponse response = null; + String result = ""; + + try { + httpClient = HttpClients.createDefault(); + + HttpGet httpGet = new HttpGet(url); + + // Spoof real browser headers + httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + httpGet.setHeader("Accept-Language", "en-US,en;q=0.9,vi;q=0.8,ug;q=0.7,fr;q=0.6"); + httpGet.setHeader("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"); + httpGet.setHeader("Priority", "u=0, i"); + httpGet.setHeader("Sec-Fetch-Dest", "document"); + httpGet.setHeader("Sec-Fetch-Mode", "navigate"); + httpGet.setHeader("Sec-Fetch-Site", "none"); + httpGet.setHeader("Sec-Fetch-User", "?1"); + httpGet.setHeader("Upgrade-Insecure-Requests", "1"); + + // Set cookies exactly like in curl + httpGet.setHeader("Cookie", "userUid=1747102033185-d570fba9-62fd-40be-93ca-ed08b4de57d4; _sid=MTc0NzEwMjAzMzE4Ni4ycTU%3D; _scor_uid=135c13065ff84620b5318b489af93e87; _gcl_au=1.1.249135292.1747102036; _ga=GA1.1.1031614211.1747102037; WZRK_G=28895afb56ff48dda59fe8de0af746bf; FCNEC=%5B%5B%22AKsRol8sHYeSYz_FYPkInYXN3P4ZDPfVKbsRfILfDuOMLhDtkTuoCJP5MlvT9gIbOe7IlDfY8ZeHszhwdVtAoKF1gWv0pLAq5EqpLpse8CEm_ZNv-bUSs6zEyqpOkeKWFI_Ei6VfNAvnZAO8PcXdF8_ncsaO902X7g%3D%3D%22%5D%5D; _ga_KRGL1M61LX=GS2.1.s1747105135$o2$g0$t1747105135$j60$l0$h0"); + + RequestConfig requestConfig = RequestConfig.custom() + .setConnectTimeout(10000) + .setSocketTimeout(15000) + .setConnectionRequestTimeout(10000) + .build(); + httpGet.setConfig(requestConfig); + + response = httpClient.execute(httpGet); + HttpEntity entity = response.getEntity(); + + if (entity != null) { + result = EntityUtils.toString(entity); + } + } catch (IOException e) { + e.printStackTrace(); + } finally { + try { + if (response != null) response.close(); + if (httpClient != null) httpClient.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + return result; + } + public static void main(String[] args) { String url = "https://marketapi.intoday.in/widget/topgainer/view?exchange=nse"; String str = doGet(url); diff --git a/src/main/resources/application-base-alpha.yml b/src/main/resources/application-base-alpha.yml index 590859e..9b896bb 100644 --- a/src/main/resources/application-base-alpha.yml +++ b/src/main/resources/application-base-alpha.yml @@ -3,9 +3,9 @@ spring: show-sql: true # Redis配置 redis: - host: 43.156.40.39 + host: 43.153.174.179 password: a5v8b86P4mVzFlUqJV - port: 30031 + port: 30001 database: 1 lettuce: pool: @@ -17,7 +17,7 @@ spring: datasource: stock-market: driver-class-name: com.mysql.cj.jdbc.Driver - url: jdbc:mysql://43.156.40.39:30030/india_stock?useUnicode=true&characterEncoding=utf-8 + url: jdbc:mysql://43.153.174.179:30000/india_stock?useUnicode=true&characterEncoding=utf-8 username: root password: uNejHIFQGJOUtYTmE maxActive: 500