Merge branch 'bug/craw_img_news' into 'develop'

update crawl img news

See merge request india/india_market_java!33
This commit is contained in:
vpckiet
2024-09-02 06:47:52 +00:00

View File

@@ -121,7 +121,8 @@ public class InvestingTask {
newsList.forEach( n -> { newsList.forEach( n -> {
String contentUrl = n.substring(1, n.indexOf("class=\"img-smllnews\"") - 2); String contentUrl = n.substring(1, n.indexOf("class=\"img-smllnews\"") - 2);
String id = contentUrl.substring(contentUrl.lastIndexOf("-") + 1, contentUrl.lastIndexOf("_")); String id = contentUrl.substring(contentUrl.lastIndexOf("-") + 1, contentUrl.lastIndexOf("_"));
String imgUrl = n.substring(n.indexOf("img loading=\"lazy\" src=") + 24, n.indexOf("?")); // String imgUrl = n.substring(n.indexOf("img loading=\"lazy\" src=") + 24, n.indexOf("?"));
String imgUrl = extractImgSrc(n);
// String time = n.substring(n.indexOf("Last Updated") + 23, n.indexOf("IST") - 9); // String time = n.substring(n.indexOf("Last Updated") + 23, n.indexOf("IST") - 9);
// Extract the date and time using regex // Extract the date and time using regex
Pattern pattern = Pattern.compile("Updated On :<!-- --> <!-- -->(.*?)<!-- -->"); Pattern pattern = Pattern.compile("Updated On :<!-- --> <!-- -->(.*?)<!-- -->");
@@ -150,4 +151,22 @@ public class InvestingTask {
} }
} }
/**
 * Extracts the image URL (without its query string) from the first
 * {@code src=} attribute that appears after the first {@code <noscript>}
 * marker in the given HTML fragment.
 *
 * <p>The character immediately following {@code src=} is treated as the
 * opening quote and skipped. The URL ends at the first {@code ?} or, when the
 * attribute is quoted, at the matching closing quote - whichever comes first -
 * so a URL without a query string is still returned intact instead of running
 * on into the markup that follows it.
 *
 * @param htmlData HTML fragment to scan; may be {@code null}
 * @return the image URL stripped of its query string, or {@code null} when the
 *         input is {@code null}, contains no {@code <noscript>} marker, has no
 *         {@code src=} attribute after that marker, or the attribute value has
 *         no recognizable end
 */
public static String extractImgSrc(String htmlData) {
    if (htmlData == null) {
        return null;
    }

    final String noscriptTag = "<noscript>";
    final String srcAttr = "src=";

    int noscriptPos = htmlData.indexOf(noscriptTag);
    if (noscriptPos == -1) {
        return null;
    }

    // Check the lookup result before doing arithmetic on it: adding offsets to
    // -1 would silently produce a bogus start index near the beginning of the input.
    int attrPos = htmlData.indexOf(srcAttr, noscriptPos);
    if (attrPos == -1) {
        return null;
    }

    int quotePos = attrPos + srcAttr.length();
    if (quotePos >= htmlData.length()) {
        return null;
    }

    char quote = htmlData.charAt(quotePos);
    int startPos = quotePos + 1; // skip the opening quote character
    int endPos = htmlData.indexOf('?', startPos);

    // Never read past the end of the attribute value: a '?' that belongs to
    // later markup must not be mistaken for this URL's query separator.
    if (quote == '"' || quote == '\'') {
        int closingQuotePos = htmlData.indexOf(quote, startPos);
        if (closingQuotePos != -1 && (endPos == -1 || closingQuotePos < endPos)) {
            endPos = closingQuotePos;
        }
    }

    if (endPos == -1) {
        return null;
    }
    return htmlData.substring(startPos, endPos);
}
} }