Merge branch 'bug/crawl_news' into 'develop'

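Update news crawl: build the Jsoup requests with referrer()/userAgent(), send a mobile Safari User-Agent, and add a 5-second timeout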

See merge request india/india_market_java!48
This commit is contained in:
vpckiet
2024-10-21 09:19:14 +00:00

View File

@@ -249,10 +249,17 @@ public class StockService {
 String result = "";
 try {
 // 使用Jsoup连接到网页
-Document doc = Jsoup.connect("https://www.business-standard.com/markets/news")
-.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
-.header("Referer", "https://www.business-standard.com/")
+// Document doc = Jsoup.connect("https://www.business-standard.com/markets/news")
+// .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
+// .header("Referer", "https://www.business-standard.com/")
+// .header("Accept-Language", "en-US,en;q=0.9")
+// .get();
+String url = "https://www.business-standard.com/markets/news";
+Document doc = Jsoup.connect(url)
+.referrer("https://www.business-standard.com/")
 .header("Accept-Language", "en-US,en;q=0.9")
+.userAgent("Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1")
+.timeout(5000) // timeout 5 seconds
 .get();
 // result = doc.html().substring(doc.html().indexOf("<div class=\"listingstyle_shortvideoimg__0TWuX shortvideoimg\">"),doc.html().lastIndexOf("<div class=\"listingstyle_shortvideoimg__0TWuX shortvideoimg\">")+500);
 Elements divElements = doc.select("div.listingstyle_cardlistlist__dfq57");
@@ -272,10 +279,16 @@ public class StockService {
 List<String> list = new ArrayList<>();
 try {
 // 使用Jsoup连接到网页
+// Document doc = Jsoup.connect(url)
+// .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
+// .header("Referer", "https://www.business-standard.com/")
+// .header("Accept-Language", "en-US,en;q=0.9")
+// .get();
 Document doc = Jsoup.connect(url)
-.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
-.header("Referer", "https://www.business-standard.com/")
+.referrer("https://www.business-standard.com/")
 .header("Accept-Language", "en-US,en;q=0.9")
+.userAgent("Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1")
+.timeout(5000) // timeout 5 seconds
 .get();
 result = doc.html().substring(doc.html().indexOf("articleBody") + 14, doc.html().indexOf(",\"author\":") - 1);
 list.add(result);