From 4a29bf6659dcf7222ae535d3c1e08eb2d3bb2688 Mon Sep 17 00:00:00 2001 From: Congkiet4695 Date: Mon, 2 Sep 2024 13:46:22 +0700 Subject: [PATCH] update crawl img news --- .../infrastructure/job/InvestingTask.java | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/main/java/cn/stock/market/infrastructure/job/InvestingTask.java b/src/main/java/cn/stock/market/infrastructure/job/InvestingTask.java index e93d0da..52d9c69 100644 --- a/src/main/java/cn/stock/market/infrastructure/job/InvestingTask.java +++ b/src/main/java/cn/stock/market/infrastructure/job/InvestingTask.java @@ -121,7 +121,8 @@ public class InvestingTask { newsList.forEach( n -> { String contentUrl = n.substring(1, n.indexOf("class=\"img-smllnews\"") - 2); String id = contentUrl.substring(contentUrl.lastIndexOf("-") + 1, contentUrl.lastIndexOf("_")); - String imgUrl = n.substring(n.indexOf("img loading=\"lazy\" src=") + 24, n.indexOf("?")); +// String imgUrl = n.substring(n.indexOf("img loading=\"lazy\" src=") + 24, n.indexOf("?")); + String imgUrl = extractImgSrc(n); // String time = n.substring(n.indexOf("Last Updated") + 23, n.indexOf("IST") - 9); // Extract the date and time using regex Pattern pattern = Pattern.compile("Updated On : (.*?)"); @@ -150,4 +151,22 @@ public class InvestingTask { } } + + public static String extractImgSrc(String htmlData) { + String searchString = "