feat: update news service to fetch Naver Stock news and enhance category handling

This commit is contained in:
2026-04-30 14:20:56 +08:00
parent 3d891d781c
commit aa9cf128d5
82 changed files with 414 additions and 87 deletions

View File

@@ -1,7 +1,4 @@
PORT=3100 PORT=3100
NEWS_API_KEY=03f614876f0645948cb9bbce1661f4b2
NEWS_API_BASE_URL=https://newsapi.org/v2/everything
NEWS_API_LANGUAGE=ko
NEWS_PAGE_SIZE=20 NEWS_PAGE_SIZE=20
NEWS_REFRESH_CRON=0 * * * * NEWS_REFRESH_CRON=0 * * * *
DATA_DIR=./data DATA_DIR=./data

8
.idea/.gitignore generated vendored Normal file
View File

@@ -0,0 +1,8 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Ask2AgentMigrationStateService">
<option name="migrationStatus" value="COMPLETED" />
</component>
</project>

View File

@@ -1,6 +1,6 @@
# News Service # News Service
独立新闻微服务,负责周期性从 NewsAPI 拉取新闻并落盘为 JSON 文件,再通过 HTTP 接口提供给前端。 独立新闻微服务,负责周期性从 Naver Stock 新闻接口拉取新闻并落盘为 JSON 文件,再通过 HTTP 接口提供给前端。
## 功能 ## 功能
@@ -12,20 +12,20 @@
## 分类 ## 分类
- finance - flashnews
- business - mainnews
- technology - ranknews
- market - worldnews
## 目录 ## 目录
```text ```text
news_service/ news_service/
data/ data/
finance.json flashnews.json
business.json mainnews.json
technology.json ranknews.json
market.json worldnews.json
src/ src/
app/ app/
config/ config/
@@ -76,15 +76,15 @@ GET /api/news/categories
### 获取指定分类新闻 ### 获取指定分类新闻
```http ```http
GET /api/news?category=finance&limit=10 GET /api/news?category=flashnews&limit=10
GET /api/news/finance?limit=10 GET /api/news/flashnews?limit=10
``` ```
### 兼容前端现有 NewsAPI 调用 ### 兼容前端现有 NewsAPI 调用
```http ```http
GET /v2/everything?q=finance&language=ko&pageSize=10&page=1 GET /v2/everything?q=flashnews&language=ko&pageSize=10&page=1
GET /v2/top-headlines?category=business&country=ko&pageSize=10 GET /v2/top-headlines?category=mainnews&country=ko&pageSize=10
``` ```
返回结构与前端当前使用的 NewsAPI 结构保持一致: 返回结构与前端当前使用的 NewsAPI 结构保持一致:
@@ -107,7 +107,7 @@ GET /api/news/all?limit=10
```http ```http
POST /api/news/refresh POST /api/news/refresh
POST /api/news/refresh?category=finance POST /api/news/refresh?category=flashnews
``` ```
## Docker ## Docker
@@ -123,8 +123,8 @@ cd /Users/wjp/Projects/juYou
docker compose -f docker-compose.news-stack.yml up -d --build docker compose -f docker-compose.news-stack.yml up -d --build
``` ```
这样前端容器中的 Nginx 会把同源路径 `/newsapi/*` 代理到容器网络中的 `news-service:3100`,浏览器不会直接访问 NewsAPI,因此不会触发跨域限制。 这样前端容器中的 Nginx 会把同源路径 `/newsapi/*` 代理到容器网络中的 `news-service:3100`,浏览器不会直接访问外部新闻接口,因此不会触发跨域限制。
## 前端接入建议 ## 前端接入建议
前端如果继续使用原来的 `/newsapi/v2/*` 请求方式,只需要把代理目标指向本服务即可,不需要修改新闻请求代码。 前端如果继续使用原来的 `/newsapi/v2/*` 请求方式,只需要把代理目标指向本服务即可,不需要修改新闻请求代码。

Binary file not shown.

After

Width:  |  Height:  |  Size: 89 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 97 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 163 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 183 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 707 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 210 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 91 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 781 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 152 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 209 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 226 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 165 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 146 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 194 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 479 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 512 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 635 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 170 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 146 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 331 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 262 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 162 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 879 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 100 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 367 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 59 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 405 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 614 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 323 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 330 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 849 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 584 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 99 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 998 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 133 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 63 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 108 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 660 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 103 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 428 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 155 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 196 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 49 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 69 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 80 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 332 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 962 KiB

View File

@@ -10,12 +10,9 @@ services:
- "3100:3100" - "3100:3100"
environment: environment:
PORT: 3100 PORT: 3100
NEWS_API_KEY: ${NEWS_API_KEY}
NEWS_API_BASE_URL: ${NEWS_API_BASE_URL:-https://newsapi.org/v2/everything}
NEWS_API_LANGUAGE: ${NEWS_API_LANGUAGE:-en}
NEWS_PAGE_SIZE: ${NEWS_PAGE_SIZE:-20} NEWS_PAGE_SIZE: ${NEWS_PAGE_SIZE:-20}
NEWS_REFRESH_CRON: ${NEWS_REFRESH_CRON:-0 * * * *} NEWS_REFRESH_CRON: ${NEWS_REFRESH_CRON:-0 * * * *}
DATA_DIR: ${DATA_DIR:-/app/data} DATA_DIR: ${DATA_DIR:-/app/data}
volumes: volumes:
- ./data:/app/data - ./data:/app/data
restart: unless-stopped restart: unless-stopped

View File

@@ -1,4 +1,5 @@
const express = require('express') const express = require('express')
const path = require('path')
const { CategoryNewsRepository } = require('../repositories/CategoryNewsRepository') const { CategoryNewsRepository } = require('../repositories/CategoryNewsRepository')
const { NewsApiClient } = require('../services/NewsApiClient') const { NewsApiClient } = require('../services/NewsApiClient')
const { NewsStorageService } = require('../services/NewsStorageService') const { NewsStorageService } = require('../services/NewsStorageService')
@@ -19,12 +20,10 @@ class NewsApplication {
this._logger = new ConsoleLogger() this._logger = new ConsoleLogger()
this._categoryCatalog = new NewsCategoryCatalog() this._categoryCatalog = new NewsCategoryCatalog()
this._repository = new CategoryNewsRepository(runtimeConfig.dataDirectory) this._repository = new CategoryNewsRepository(runtimeConfig.dataDirectory)
this._apiClient = new NewsApiClient( this._apiClient = new NewsApiClient({
runtimeConfig.apiBaseUrl, pageSize: runtimeConfig.pageSize,
runtimeConfig.apiKey, dataDirectory: runtimeConfig.dataDirectory
runtimeConfig.language, })
runtimeConfig.pageSize
)
this._storageService = new NewsStorageService(this._categoryCatalog, this._repository, this._apiClient) this._storageService = new NewsStorageService(this._categoryCatalog, this._repository, this._apiClient)
this._scheduler = new NewsRefreshScheduler(this._storageService, runtimeConfig.refreshCron, this._logger) this._scheduler = new NewsRefreshScheduler(this._storageService, runtimeConfig.refreshCron, this._logger)
this._express = express() this._express = express()
@@ -69,6 +68,7 @@ class NewsApplication {
*/ */
_configureHttpPipeline() { _configureHttpPipeline() {
this._express.use(express.json()) this._express.use(express.json())
this._express.use('/api/news/images', express.static(path.join(this._runtimeConfig.dataDirectory, 'images')))
this._express.get('/health', (_request, response) => { this._express.get('/health', (_request, response) => {
response.json({ response.json({
@@ -95,4 +95,4 @@ class NewsApplication {
module.exports = { module.exports = {
NewsApplication NewsApplication
} }

View File

@@ -9,12 +9,16 @@ class NewsCategory {
* @param {string} query * @param {string} query
* @param {string} fileName * @param {string} fileName
* @param {string} label * @param {string} label
* @param {string} apiUrl
* @param {boolean} worldNews
*/ */
constructor(key, query, fileName, label) { constructor(key, query, fileName, label, apiUrl, worldNews = false) {
this.key = key this.key = key
this.query = query this.query = query
this.fileName = fileName this.fileName = fileName
this.label = label this.label = label
this.apiUrl = apiUrl
this.worldNews = worldNews
} }
} }
@@ -26,11 +30,22 @@ class NewsCategory {
class NewsCategoryCatalog { class NewsCategoryCatalog {
constructor() { constructor() {
this._categories = [ this._categories = [
new NewsCategory('finance', 'finance', 'finance.json', 'Finance'), new NewsCategory('flashnews', 'flashnews', 'flashnews.json', 'Realtime News', 'https://m.stock.naver.com/front-api/news/category?category=flashnews&pageSize=20&page=1'),
new NewsCategory('business', 'business', 'business.json', 'Business'), new NewsCategory('mainnews', 'mainnews', 'mainnews.json', 'Main News', 'https://m.stock.naver.com/front-api/news/category?category=mainnews&pageSize=20&page=1'),
new NewsCategory('technology', 'technology', 'technology.json', 'Technology'), new NewsCategory('ranknews', 'ranknews', 'ranknews.json', 'Rank News', 'https://m.stock.naver.com/front-api/news/category?category=ranknews&pageSize=20&page=1'),
new NewsCategory('market', 'market', 'market.json', 'Market') new NewsCategory('worldnews', 'worldnews', 'worldnews.json', 'World News', 'https://m.stock.naver.com/front-api/news/worldnews?pageSize=20&page=1', true)
] ]
this._aliases = new Map([
['finance', 'flashnews'],
['business', 'mainnews'],
['technology', 'ranknews'],
['market', 'worldnews'],
['flash', 'flashnews'],
['main', 'mainnews'],
['rank', 'ranknews'],
['ranks', 'ranknews'],
['world', 'worldnews']
])
} }
/** /**
@@ -45,11 +60,12 @@ class NewsCategoryCatalog {
* @returns {NewsCategory | null} * @returns {NewsCategory | null}
*/ */
getByKey(key) { getByKey(key) {
return this._categories.find((item) => item.key === key) || null const normalizedKey = this._aliases.get(key) || key
return this._categories.find((item) => item.key === normalizedKey) || null
} }
} }
module.exports = { module.exports = {
NewsCategory, NewsCategory,
NewsCategoryCatalog NewsCategoryCatalog
} }

View File

@@ -11,14 +11,11 @@ class ServiceConfig {
} }
/** /**
* @returns {{port: number, apiKey: string, apiBaseUrl: string, language: string, pageSize: number, refreshCron: string, dataDirectory: string}} * @returns {{port: number, pageSize: number, refreshCron: string, dataDirectory: string}}
*/ */
toRuntimeConfig() { toRuntimeConfig() {
return { return {
port: Number(this._env.PORT || 3100), port: Number(this._env.PORT || 3100),
apiKey: this._env.NEWS_API_KEY || '',
apiBaseUrl: this._env.NEWS_API_BASE_URL || 'https://newsapi.org/v2/everything',
language: this._env.NEWS_API_LANGUAGE || 'en',
pageSize: Number(this._env.NEWS_PAGE_SIZE || 20), pageSize: Number(this._env.NEWS_PAGE_SIZE || 20),
refreshCron: this._env.NEWS_REFRESH_CRON || '0 * * * *', refreshCron: this._env.NEWS_REFRESH_CRON || '0 * * * *',
dataDirectory: path.resolve(process.cwd(), this._env.DATA_DIR || './data') dataDirectory: path.resolve(process.cwd(), this._env.DATA_DIR || './data')
@@ -28,4 +25,4 @@ class ServiceConfig {
module.exports = { module.exports = {
ServiceConfig ServiceConfig
} }

View File

@@ -1,14 +1,25 @@
const crypto = require('crypto')
const fs = require('fs/promises')
const path = require('path')
const NAVER_ARTICLE_URL = 'https://n.news.naver.com/article/'
const NAVER_WORLD_NEWS_URL = 'https://m.stock.naver.com/investment/news/worldnews/'
/** /**
* NewsApiClient * NewsApiClient
* *
* Wraps outbound requests to NewsAPI. * Fetches Naver Stock news and normalizes it to the existing NewsAPI-shaped
* article contract consumed by the front-end compatibility routes.
*/ */
class NewsApiClient { class NewsApiClient {
constructor(apiBaseUrl, apiKey, language, pageSize) { constructor(apiBaseUrlOrOptions, _apiKey, _language, pageSize) {
this._apiBaseUrl = apiBaseUrl const options = typeof apiBaseUrlOrOptions === 'object'
this._apiKey = apiKey ? apiBaseUrlOrOptions
this._language = language : { pageSize }
this._pageSize = pageSize
this._pageSize = options.pageSize || 20
this._fetch = options.fetchImplementation || fetch
this._imageStorage = options.imageStorage || new NewsImageStorage(options.dataDirectory)
} }
/** /**
@@ -16,54 +27,348 @@ class NewsApiClient {
* @returns {Promise<object[]>} * @returns {Promise<object[]>}
*/ */
async fetchArticlesByCategory(category) { async fetchArticlesByCategory(category) {
if (!this._apiKey) { const listUrl = this._createListUrl(category)
throw new Error('NEWS_API_KEY is required') const response = await this._fetch(listUrl)
}
const url = new URL(this._apiBaseUrl)
url.searchParams.set('q', category.query)
url.searchParams.set('language', this._language)
url.searchParams.set('pageSize', String(this._pageSize))
url.searchParams.set('page', '1')
url.searchParams.set('sortBy', 'publishedAt')
const response = await fetch(url, {
headers: {
'X-Api-Key': this._apiKey
}
})
if (!response.ok) { if (!response.ok) {
const failure = await response.json().catch(() => ({})) throw new Error(`Naver news request failed with status ${response.status}`)
throw new Error(failure.message || `NewsAPI request failed with status ${response.status}`)
} }
const payload = await response.json() const payload = await response.json()
if (payload.status !== 'ok') { const items = this._extractNewsItems(payload)
throw new Error(payload.message || 'NewsAPI returned a non-ok payload')
}
return (payload.articles || []).map((article) => this._normalizeArticle(article)) const articles = []
for (const item of items) {
const article = await this._normalizeArticle(item, category)
if (article) {
articles.push(article)
}
}
return articles
} }
/** /**
* @param {object} article * @param {import('../config/CategoryCatalog').NewsCategory} category
* @returns {object} * @returns {string}
*/ */
_normalizeArticle(article) { _createListUrl(category) {
return { const url = new URL(category.apiUrl)
source: article.source || null, url.searchParams.set('pageSize', String(this._pageSize))
author: article.author || null, url.searchParams.set('page', '1')
title: article.title || '', return url.toString()
description: article.description || '', }
url: article.url || '',
urlToImage: article.urlToImage || '', /**
publishedAt: article.publishedAt || null, * @param {object} payload
content: article.content || '' * @returns {object[]}
*/
_extractNewsItems(payload) {
if (Array.isArray(payload)) {
return payload
} }
const directCandidates = [
payload?.result?.newsList,
payload?.result?.items,
payload?.result?.list,
payload?.result,
payload?.items,
payload?.newsList,
payload?.list
]
const direct = directCandidates.find((candidate) => Array.isArray(candidate))
if (direct) {
return direct
}
return this._findFirstNewsArray(payload) || []
}
/**
* @param {unknown} value
* @returns {object[] | null}
*/
_findFirstNewsArray(value) {
if (!value || typeof value !== 'object') {
return null
}
if (Array.isArray(value)) {
return value.some((item) => item && typeof item === 'object' && this._getSourceId(item))
? value
: null
}
for (const child of Object.values(value)) {
const found = this._findFirstNewsArray(child)
if (found) {
return found
}
}
return null
}
/**
* @param {object} item
* @param {import('../config/CategoryCatalog').NewsCategory} category
* @returns {Promise<object | null>}
*/
async _normalizeArticle(item, category) {
const sourceId = this._getSourceId(item)
if (!sourceId) {
return null
}
const detailUrl = this._createDetailUrl(category, sourceId)
const content = await this._fetchArticleContent(detailUrl, category.worldNews)
const title = this._firstString(item.titleFull, item.title, item.newsTitle, item.articleTitle, item.headline)
const sourceName = this._firstString(item.officeName, item.pressName, item.providerName, item.sourceName)
return {
source: {
id: sourceId.split('/')[0] || null,
name: sourceName || null
},
author: this._firstString(item.author, item.writerName, item.reporter) || null,
title,
description: this._firstString(item.summary, item.description, item.body, item.subTitle) || '',
url: detailUrl,
urlToImage: this._firstString(item.imageUrl, item.thumbnailUrl, item.thumbnail, item.imageOriginLink) || '',
publishedAt: normalizePublishedAt(this._firstString(item.datetime, item.dateTime, item.publishedAt, item.officeDateTime, item.regDate)) || null,
content
}
}
/**
* @param {object} item
* @returns {string}
*/
_getSourceId(item) {
const sourceId = this._firstString(item.sourceId, item.articleSourceId)
if (sourceId) {
return sourceId.replace(/^\/+/, '')
}
const officeId = this._firstString(item.officeId, item.pressId)
const articleId = this._firstString(item.articleId, item.newsId)
if (officeId && articleId) {
return `${officeId}/${articleId}`
}
return ''
}
/**
* @param {import('../config/CategoryCatalog').NewsCategory} category
* @param {string} sourceId
* @returns {string}
*/
_createDetailUrl(category, sourceId) {
const baseUrl = category.worldNews ? NAVER_WORLD_NEWS_URL : NAVER_ARTICLE_URL
return `${baseUrl}${sourceId}`
}
/**
* @param {string} detailUrl
* @param {boolean} worldNews
* @returns {Promise<string>}
*/
async _fetchArticleContent(detailUrl, worldNews) {
let response
try {
response = await this._fetch(detailUrl)
} catch (_error) {
return ''
}
if (!response.ok) {
return ''
}
const html = await response.text()
const content = worldNews
? this._extractElementHtmlById(html, 'content')
: this._extractTagHtml(html, 'article')
return this._replaceContentImages(content || '', detailUrl)
}
/**
* @param {string} html
* @param {string} tagName
* @returns {string}
*/
_extractTagHtml(html, tagName) {
const expression = new RegExp(`<${tagName}\\b[^>]*>[\\s\\S]*?<\\/${tagName}>`, 'i')
const match = html.match(expression)
return match ? match[0] : ''
}
/**
* @param {string} html
* @param {string} id
* @returns {string}
*/
_extractElementHtmlById(html, id) {
const startExpression = new RegExp(`<([a-z][\\w:-]*)\\b(?=[^>]*\\bid=["']${escapeRegExp(id)}["'])[^>]*>`, 'i')
const startMatch = startExpression.exec(html)
if (!startMatch) {
return ''
}
const tagName = startMatch[1]
const startIndex = startMatch.index
const openTagEnd = startIndex + startMatch[0].length
const tokenExpression = new RegExp(`<\\/?${escapeRegExp(tagName)}\\b[^>]*>`, 'ig')
tokenExpression.lastIndex = openTagEnd
let depth = 1
let tokenMatch
while ((tokenMatch = tokenExpression.exec(html)) !== null) {
if (tokenMatch[0][1] === '/') {
depth -= 1
if (depth === 0) {
return html.slice(startIndex, tokenExpression.lastIndex)
}
} else {
depth += 1
}
}
return ''
}
/**
* @param {string} html
* @returns {Promise<string>}
*/
async _replaceContentImages(html, baseUrl) {
const imageExpression = /<img\b[^>]*\bsrc=["']([^"']+)["'][^>]*>/ig
const replacements = []
let match
while ((match = imageExpression.exec(html)) !== null) {
replacements.push({
originalTag: match[0],
originalUrl: match[1]
})
}
let updatedHtml = html
for (const replacement of replacements) {
const localUrl = await this._downloadImage(replacement.originalUrl, baseUrl)
if (localUrl) {
updatedHtml = updatedHtml.replace(
replacement.originalTag,
replacement.originalTag.replace(replacement.originalUrl, localUrl)
)
}
}
return updatedHtml
}
/**
* @param {string} imageUrl
* @returns {Promise<string>}
*/
async _downloadImage(imageUrl, baseUrl) {
const absoluteImageUrl = toAbsoluteHttpUrl(imageUrl, baseUrl)
if (!absoluteImageUrl) {
return ''
}
let response
try {
response = await this._fetch(absoluteImageUrl)
} catch (_error) {
return ''
}
if (!response.ok) {
return ''
}
const contentType = response.headers?.get?.('content-type') || 'application/octet-stream'
const buffer = Buffer.from(await response.arrayBuffer())
return this._imageStorage.saveImage(absoluteImageUrl, buffer, contentType)
}
/**
* @param {...unknown} values
* @returns {string}
*/
_firstString(...values) {
const value = values.find((item) => typeof item === 'string' && item.trim())
return value ? value.trim() : ''
} }
} }
/**
 * NewsImageStorage
 *
 * Persists downloaded article images under `<dataDirectory>/images` and
 * returns the URL path under which each stored file is referenced.
 */
class NewsImageStorage {
  /**
   * @param {string} [dataDirectory] base data directory; './data' when omitted or empty
   */
  constructor(dataDirectory) {
    const baseDirectory = dataDirectory || './data'
    this._imageDirectory = path.resolve(baseDirectory, 'images')
  }

  /**
   * Writes the image to disk. The file name is the first 24 hex characters of
   * the SHA-256 of the image URL plus an extension taken from the content
   * type, then from the URL, then '.bin'.
   *
   * @param {string} imageUrl
   * @param {Buffer} buffer
   * @param {string} contentType
   * @returns {Promise<string>} URL path of the stored image
   */
  async saveImage(imageUrl, buffer, contentType) {
    await fs.mkdir(this._imageDirectory, { recursive: true })

    let extension = extensionFromContentType(contentType)
    if (!extension) {
      extension = extensionFromUrl(imageUrl)
    }
    if (!extension) {
      extension = '.bin'
    }

    const hash = crypto.createHash('sha256')
    hash.update(imageUrl)
    const digest = hash.digest('hex').slice(0, 24)

    const fileName = `${digest}${extension}`
    const targetPath = path.join(this._imageDirectory, fileName)
    await fs.writeFile(targetPath, buffer)
    return `/api/news/images/${fileName}`
  }
}
/**
 * Maps a Content-Type header value to a file extension.
 *
 * Parameters after ';' are ignored and the media type is compared
 * case-insensitively. A Map is used for the lookup so that header values
 * naming inherited object members (e.g. "constructor", "__proto__") cannot
 * resolve to anything other than ''.
 *
 * @param {string} contentType
 * @returns {string} extension including the leading dot, or '' when the type is unknown
 */
function extensionFromContentType(contentType) {
  const normalized = String(contentType || '').split(';')[0].trim().toLowerCase()
  const byContentType = new Map([
    ['image/jpeg', '.jpg'],
    ['image/jpg', '.jpg'],
    ['image/png', '.png'],
    ['image/gif', '.gif'],
    ['image/webp', '.webp'],
    ['image/svg+xml', '.svg']
  ])
  return byContentType.get(normalized) || ''
}
/**
 * Extracts the lower-cased file extension from the path of an absolute URL.
 *
 * @param {string} imageUrl
 * @returns {string} extension including the leading dot; '' when the URL does
 *   not parse, has no extension, or the extension is longer than 6 characters
 */
function extensionFromUrl(imageUrl) {
  let pathname
  try {
    pathname = new URL(imageUrl).pathname
  } catch (_error) {
    return ''
  }

  const suffix = path.extname(pathname).toLowerCase()
  if (suffix.length > 6) {
    return ''
  }
  return suffix
}
/**
 * Resolves `value` against `baseUrl` and returns it as an absolute URL string,
 * accepting only the http and https schemes.
 *
 * @param {string} value absolute or relative URL
 * @param {string} baseUrl base used for relative values
 * @returns {string} the absolute URL, or '' when it cannot be parsed or uses another scheme
 */
function toAbsoluteHttpUrl(value, baseUrl) {
  let resolved
  try {
    resolved = new URL(value, baseUrl)
  } catch (_error) {
    return ''
  }

  const isHttp = resolved.protocol === 'http:' || resolved.protocol === 'https:'
  return isHttp ? resolved.toString() : ''
}
/**
 * Converts a compact numeric timestamp into an ISO-8601 string with a fixed
 * +09:00 offset. Both `yyyyMMddHHmmss` (14 digits) and `yyyyMMddHHmm`
 * (12 digits, seconds taken as 00) are recognised; any other non-empty value
 * is returned unchanged.
 *
 * NOTE(review): the 12-digit form is presumably what the list endpoints send
 * in `datetime` - confirm against a live payload.
 *
 * @param {string} value
 * @returns {string} normalized timestamp, the original value, or '' for empty input
 */
function normalizePublishedAt(value) {
  if (!value) {
    return ''
  }

  // Seconds are optional so that minute-resolution stamps are normalized too.
  const compact = /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})?$/.exec(value)
  if (!compact) {
    return value
  }

  const [, year, month, day, hour, minute, second = '00'] = compact
  return `${year}-${month}-${day}T${hour}:${minute}:${second}+09:00`
}
/**
 * Escapes the characters that are special inside a regular expression so the
 * value can be embedded literally in a `RegExp` source string.
 *
 * @param {unknown} value converted with `String()` before escaping
 * @returns {string}
 */
function escapeRegExp(value) {
  const specialCharacters = /[.*+?^${}()|[\]\\]/g
  const text = String(value)
  // `$&` re-inserts the matched character after the added backslash.
  return text.replace(specialCharacters, '\\$&')
}
module.exports = { module.exports = {
NewsApiClient NewsApiClient,
} NewsImageStorage
}

View File

@@ -86,7 +86,8 @@ class NewsStorageService {
key: category.key, key: category.key,
label: category.label, label: category.label,
query: category.query, query: category.query,
fileName: category.fileName fileName: category.fileName,
apiUrl: category.apiUrl
})) }))
} }
@@ -121,4 +122,4 @@ class NewsStorageService {
module.exports = { module.exports = {
NewsStorageService NewsStorageService
} }