update crawl img news

This commit is contained in:
Congkiet4695
2024-09-02 13:46:22 +07:00
parent 5b2eb8519a
commit 4a29bf6659

View File

@@ -121,7 +121,8 @@ public class InvestingTask {
newsList.forEach( n -> {
String contentUrl = n.substring(1, n.indexOf("class=\"img-smllnews\"") - 2);
String id = contentUrl.substring(contentUrl.lastIndexOf("-") + 1, contentUrl.lastIndexOf("_"));
String imgUrl = n.substring(n.indexOf("img loading=\"lazy\" src=") + 24, n.indexOf("?"));
// String imgUrl = n.substring(n.indexOf("img loading=\"lazy\" src=") + 24, n.indexOf("?"));
String imgUrl = extractImgSrc(n);
// String time = n.substring(n.indexOf("Last Updated") + 23, n.indexOf("IST") - 9);
// Extract the date and time using regex
Pattern pattern = Pattern.compile("Updated On :<!-- --> <!-- -->(.*?)<!-- -->");
@@ -150,4 +151,22 @@ public class InvestingTask {
}
}
/**
 * Extracts the image URL, without its query string, from the first
 * {@code src=} attribute found at or after the first {@code <noscript>}
 * marker in the given HTML fragment.
 *
 * <p>The attribute value may be wrapped in double or single quotes, or be
 * unquoted. The returned URL ends at the first {@code ?} inside the value,
 * or at the end of the value (closing quote, or whitespace / {@code >} for
 * an unquoted value) when there is no query string.
 *
 * @param htmlData HTML fragment to scan; may be {@code null}
 * @return the URL up to (not including) any query string, or {@code null}
 *         when the input is {@code null}, no {@code <noscript>} marker or
 *         {@code src=} attribute is present, the value is empty, or a
 *         quoted value is never terminated
 */
public static String extractImgSrc(String htmlData) {
    if (htmlData == null) {
        return null;
    }

    final String noscriptTag = "<noscript>";
    final String srcAttr = "src=";

    int noscriptPos = htmlData.indexOf(noscriptTag);
    if (noscriptPos == -1) {
        return null;
    }

    // Check the lookup result BEFORE doing arithmetic on it: adding the
    // attribute length to -1 would yield a valid-looking but bogus offset.
    int attrPos = htmlData.indexOf(srcAttr, noscriptPos);
    if (attrPos == -1) {
        return null;
    }

    int valueStart = attrPos + srcAttr.length();
    if (valueStart >= htmlData.length()) {
        return null;
    }

    // Only skip the first character when it really is an opening quote.
    char opener = htmlData.charAt(valueStart);
    boolean quoted = opener == '"' || opener == '\'';
    if (quoted) {
        valueStart++;
    }

    // Stop at the query string or at the end of the attribute value,
    // whichever comes first, so the result never spills into later markup.
    int valueEnd = valueStart;
    while (valueEnd < htmlData.length()) {
        char c = htmlData.charAt(valueEnd);
        if (c == '?') {
            break;
        }
        if (quoted ? c == opener : (c == '>' || Character.isWhitespace(c))) {
            break;
        }
        valueEnd++;
    }

    // A quoted value that runs off the end of the input is truncated markup.
    if (quoted && valueEnd == htmlData.length()) {
        return null;
    }
    if (valueEnd == valueStart) {
        return null;
    }
    return htmlData.substring(valueStart, valueEnd);
}
}