update news

This commit is contained in:
vu-tran
2025-05-13 10:10:42 +07:00
parent c72c0ff4cd
commit 90a26823f0
3 changed files with 141 additions and 32 deletions

View File

@@ -255,47 +255,106 @@ public class StockService {
// .header("Accept-Language", "en-US,en;q=0.9")
// .get();
String url = "https://www.business-standard.com/markets/news";
Document doc = Jsoup.connect(url)
.referrer("https://www.business-standard.com/")
.header("Accept-Language", "en-US,en;q=0.9")
.userAgent("Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1")
.timeout(5000) // timeout 5 seconds
.get();
// result = doc.html().substring(doc.html().indexOf("<div class=\"listingstyle_shortvideoimg__0TWuX shortvideoimg\">"),doc.html().lastIndexOf("<div class=\"listingstyle_shortvideoimg__0TWuX shortvideoimg\">")+500);
Elements divElements = doc.select("div.listingstyle_cardlistlist__dfq57");
StringBuilder sb = new StringBuilder();
for (Element divElement : divElements) {
sb.append(divElement.outerHtml()).append("\n");
}
result = sb.toString();
// Document doc = Jsoup.connect(url)
// .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
// .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
// .header("Accept-Language", "en-US,en;q=0.5")
// .header("Connection", "keep-alive")
// .referrer("https://www.google.com")
// .ignoreHttpErrors(true)
// .timeout(10000)
// .get();
String doGetNews = HttpClientRequest.doGetNews(url);
// result = doc.html().substring(doc.html().indexOf("<div class=\"listingstyle_shortvideoimg__0TWuX shortvideoimg\">"),doc.html().lastIndexOf("<div class=\"listingstyle_shortvideoimg__0TWuX shortvideoimg\">")+500);
// Elements divElements = doc.select("div.listingstyle_cardlistlist__dfq57");
// StringBuilder sb = new StringBuilder();
// for (Element divElement : divElements) {
// sb.append(divElement.outerHtml()).append("\n");
// }
// result = sb.toString();
result = extractNewsFromHtml(doGetNews);
} catch (Exception e) {
return e.toString();
}
return result;
}
/**
 * Extracts the news-listing cards from a raw HTML page.
 *
 * <p>The markup is parsed with Jsoup, every element matching
 * {@code div.listingstyle_cardlistlist__dfq57} is selected, and the outer HTML of each
 * match is appended to the result followed by a newline.
 *
 * @param rawHtml the page markup; may be {@code null} or empty
 * @return the concatenated outer HTML of all matching elements, or an empty string when
 *         the input is {@code null}/empty or no element matches
 */
private String extractNewsFromHtml(String rawHtml) {
    if (rawHtml == null || rawHtml.isEmpty()) {
        return "";
    }
    // Only the CSS-selected cards are returned. No raw substring slicing is performed:
    // index arithmetic such as "last marker position + 500" can run past the end of the
    // page and make String.substring throw.
    Document doc = Jsoup.parse(rawHtml);
    StringBuilder cards = new StringBuilder();
    for (Element card : doc.select("div.listingstyle_cardlistlist__dfq57")) {
        cards.append(card.outerHtml()).append("\n");
    }
    return cards.toString();
}
/**
 * Downloads a news article page and extracts its body text and its title.
 *
 * <p>The page is fetched once with {@code HttpClientRequest.doGetNews(url)} and both values
 * are cut out of the raw HTML by marker search:
 * <ul>
 *   <li>body: the text starting 14 characters after the first {@code articleBody} and ending
 *       1 character before the following {@code ,"author":};</li>
 *   <li>title: the text starting 19 characters after the first {@code og:title} and ending
 *       5 characters before the following {@code <meta property="og:url}.</li>
 * </ul>
 *
 * @param url address of the article page
 * @return a list whose first entry is the article body and whose second entry is the title;
 *         a value whose markers are missing is replaced by {@code "[articleBody not found]"}
 *         or {@code "[title not found]"}; if an exception is raised, an entry starting with
 *         {@code "[Error] "} is appended and whatever was collected so far is returned
 */
public List<String> getNewsInfo(String url) {
    List<String> list = new ArrayList<>();
    try {
        String rawHtml = HttpClientRequest.doGetNews(url);

        String body = extractBetweenMarkers(rawHtml, "articleBody", 14, ",\"author\":", 1);
        list.add(body != null ? body : "[articleBody not found]");

        String title = extractBetweenMarkers(rawHtml, "og:title", 19, "<meta property=\"og:url", 5);
        list.add(title != null ? title : "[title not found]");
    } catch (Exception e) {
        // Include the exception type as well as the message: getMessage() alone may be null.
        list.add("[Error] " + e);
    }
    return list;
}

/**
 * Returns the part of {@code html} that lies between two markers.
 *
 * <p>Marker presence is tested on the raw {@code indexOf} result, before any offset is
 * added; adding the offset first would turn the "not found" value {@code -1} into a
 * positive index and hide the miss. The end marker is searched only after the start
 * position so that an earlier occurrence cannot be picked up.
 *
 * @param html        text to search; may be {@code null}
 * @param startMarker marker preceding the wanted text
 * @param startSkip   number of characters from the beginning of {@code startMarker} to the
 *                    first wanted character
 * @param endMarker   marker following the wanted text
 * @param endTrim     number of characters to drop immediately before {@code endMarker}
 * @return the extracted text, or {@code null} when either marker is missing or the
 *         resulting range is empty
 */
private static String extractBetweenMarkers(String html, String startMarker, int startSkip,
                                            String endMarker, int endTrim) {
    if (html == null) {
        return null;
    }
    int startMarkerPos = html.indexOf(startMarker);
    if (startMarkerPos < 0) {
        return null;
    }
    int start = startMarkerPos + startSkip;
    int endMarkerPos = html.indexOf(endMarker, start);
    if (endMarkerPos < 0) {
        return null;
    }
    int end = endMarkerPos - endTrim;
    return end > start ? html.substring(start, end) : null;
}

View File

@@ -265,6 +265,56 @@ public class HttpClientRequest {
return result;
}
/**
 * Performs an HTTP GET with browser-like request headers and returns the response body.
 *
 * <p>The request carries a fixed mobile-Safari {@code User-Agent}, {@code Sec-Fetch-*}
 * headers and a hard-coded {@code Cookie} header. Timeouts are 10 s to connect, 10 s to
 * obtain a connection from the pool and 15 s for socket reads. The HTTP status code is not
 * inspected; whatever body the server returns is handed back.
 *
 * <p>The body is decoded with the charset declared by the response, falling back to UTF-8
 * when none is declared.
 *
 * @param url the address to fetch
 * @return the response body, or an empty string when the response has no entity or an
 *         {@link IOException} occurs (the exception is printed, not rethrown)
 */
public static String doGetNews(String url) {
    HttpGet httpGet = new HttpGet(url);

    // Spoof real browser headers
    httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
    httpGet.setHeader("Accept-Language", "en-US,en;q=0.9,vi;q=0.8,ug;q=0.7,fr;q=0.6");
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1");
    httpGet.setHeader("Priority", "u=0, i");
    httpGet.setHeader("Sec-Fetch-Dest", "document");
    httpGet.setHeader("Sec-Fetch-Mode", "navigate");
    httpGet.setHeader("Sec-Fetch-Site", "none");
    httpGet.setHeader("Sec-Fetch-User", "?1");
    httpGet.setHeader("Upgrade-Insecure-Requests", "1");

    // Cookie values captured from a browser/curl session.
    // NOTE(review): these look like session and analytics identifiers that will presumably
    // expire - confirm whether they are required and consider moving them to configuration.
    httpGet.setHeader("Cookie", "userUid=1747102033185-d570fba9-62fd-40be-93ca-ed08b4de57d4; _sid=MTc0NzEwMjAzMzE4Ni4ycTU%3D; _scor_uid=135c13065ff84620b5318b489af93e87; _gcl_au=1.1.249135292.1747102036; _ga=GA1.1.1031614211.1747102037; WZRK_G=28895afb56ff48dda59fe8de0af746bf; FCNEC=%5B%5B%22AKsRol8sHYeSYz_FYPkInYXN3P4ZDPfVKbsRfILfDuOMLhDtkTuoCJP5MlvT9gIbOe7IlDfY8ZeHszhwdVtAoKF1gWv0pLAq5EqpLpse8CEm_ZNv-bUSs6zEyqpOkeKWFI_Ei6VfNAvnZAO8PcXdF8_ncsaO902X7g%3D%3D%22%5D%5D; _ga_KRGL1M61LX=GS2.1.s1747105135$o2$g0$t1747105135$j60$l0$h0");

    RequestConfig requestConfig = RequestConfig.custom()
            .setConnectTimeout(10000)
            .setSocketTimeout(15000)
            .setConnectionRequestTimeout(10000)
            .build();
    httpGet.setConfig(requestConfig);

    String result = "";
    // try-with-resources closes the response and then the client, each independently, so a
    // failure while closing the response can no longer leave the client open.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            // Explicit fallback charset: without it a response that declares no charset is
            // decoded as ISO-8859-1, which garbles non-Latin text.
            result = EntityUtils.toString(entity, "UTF-8");
        }
    } catch (IOException e) {
        // Best-effort contract: report the failure and return what we have (possibly "").
        e.printStackTrace();
    }
    return result;
}
public static void main(String[] args) {
String url = "https://marketapi.intoday.in/widget/topgainer/view?exchange=nse";
String str = doGet(url);

View File

@@ -3,9 +3,9 @@ spring:
show-sql: true
# Redis configuration
redis:
host: 43.156.40.39
host: 43.153.174.179
password: a5v8b86P4mVzFlUqJV
port: 30031
port: 30001
database: 1
lettuce:
pool:
@@ -17,7 +17,7 @@ spring:
datasource:
stock-market:
driver-class-name: com.mysql.cj.jdbc.Driver
url: jdbc:mysql://43.156.40.39:30030/india_stock?useUnicode=true&characterEncoding=utf-8
url: jdbc:mysql://43.153.174.179:30000/india_stock?useUnicode=true&characterEncoding=utf-8
username: root
password: uNejHIFQGJOUtYTmE
maxActive: 500