feat: pin a fixed article to the top of news results

Add PinnedArticleService that always returns configured Naver articles
first to the front-end. Articles are built from a bare URL via a new
NewsApiClient.buildArticleFromUrl (OG/meta parsing + existing content and
image-localization pipeline), cached to disk with a TTL, and prepended
with de-duplication in NewsStorageService. Configurable via
NEWS_PINNED_ARTICLE_URLS / NEWS_PINNED_CATEGORIES / NEWS_PINNED_TTL_MS.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-04 16:33:17 +08:00
parent b57b64ef95
commit 08cdfe7607
6 changed files with 407 additions and 14 deletions

View File

@@ -2,3 +2,12 @@ PORT=3100
NEWS_PAGE_SIZE=20
NEWS_REFRESH_CRON=0 * * * *
DATA_DIR=./data
# Pinned articles: always returned first, in order, to the front-end.
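# Comma-separated Naver article URLs. If this is unset or left empty, the
# service falls back to its built-in default URL rather than disabling pins.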
NEWS_PINNED_ARTICLE_URLS=https://n.news.naver.com/mnews/article/009/0005689327
# Categories the pins apply to (comma-separated keys: flashnews,mainnews,ranknews,worldnews
# or front-end aliases finance,business,technology,market). Leave empty for all categories.
NEWS_PINNED_CATEGORIES=
# How long (ms) a built pin stays cached before a rebuild is attempted. Default 6h.
NEWS_PINNED_TTL_MS=21600000

View File

@@ -3,6 +3,7 @@ const path = require('path')
const { CategoryNewsRepository } = require('../repositories/CategoryNewsRepository')
const { NewsApiClient } = require('../services/NewsApiClient')
const { NewsStorageService } = require('../services/NewsStorageService')
const { PinnedArticleService } = require('../services/PinnedArticleService')
const { NewsRefreshScheduler } = require('../services/NewsRefreshScheduler')
const { NewsController } = require('../controllers/NewsController')
const { NewsApiCompatibilityController } = require('../controllers/NewsApiCompatibilityController')
@@ -24,7 +25,18 @@ class NewsApplication {
pageSize: runtimeConfig.pageSize,
dataDirectory: runtimeConfig.dataDirectory
})
this._storageService = new NewsStorageService(this._categoryCatalog, this._repository, this._apiClient)
this._pinnedArticleService = new PinnedArticleService(this._apiClient, {
urls: runtimeConfig.pinnedArticleUrls,
categoryKeys: runtimeConfig.pinnedCategoryKeys,
dataDirectory: runtimeConfig.dataDirectory,
ttlMs: runtimeConfig.pinnedTtlMs
})
this._storageService = new NewsStorageService(
this._categoryCatalog,
this._repository,
this._apiClient,
this._pinnedArticleService
)
this._scheduler = new NewsRefreshScheduler(this._storageService, runtimeConfig.refreshCron, this._logger)
this._express = express()
this._server = null

View File

@@ -11,16 +11,38 @@ class ServiceConfig {
}
/**
* @returns {{port: number, pageSize: number, refreshCron: string, dataDirectory: string}}
* @returns {{port: number, pageSize: number, refreshCron: string, dataDirectory: string, pinnedArticleUrls: string[], pinnedCategoryKeys: string[] | null, pinnedTtlMs: number}}
*/
toRuntimeConfig() {
// Every setting is read from the injected environment; an unset or empty
// variable resolves to the literal default written next to it.
return {
port: Number(this._env.PORT || 3100),
pageSize: Number(this._env.NEWS_PAGE_SIZE || 20),
refreshCron: this._env.NEWS_REFRESH_CRON || '0 * * * *',
dataDirectory: path.resolve(process.cwd(), this._env.DATA_DIR || './data')
dataDirectory: path.resolve(process.cwd(), this._env.DATA_DIR || './data'),
// An unset or blank NEWS_PINNED_ARTICLE_URLS resolves to this built-in pin.
pinnedArticleUrls: this._parseList(this._env.NEWS_PINNED_ARTICLE_URLS, [
'https://n.news.naver.com/mnews/article/009/0005689327'
]),
// null is the fallback when NEWS_PINNED_CATEGORIES is unset or blank.
pinnedCategoryKeys: this._parseList(this._env.NEWS_PINNED_CATEGORIES, null),
// Default TTL is 6 hours, expressed in milliseconds.
pinnedTtlMs: Number(this._env.NEWS_PINNED_TTL_MS || 6 * 60 * 60 * 1000)
}
}
/**
* Parses a comma-separated environment value into a trimmed list.
*
* @param {string | undefined} value
* @param {string[] | null} fallback - Returned when the value is unset.
* @returns {string[] | null}
*/
_parseList(value, fallback) {
if (value === undefined || value === null || value.trim() === '') {
return fallback
}
return value
.split(',')
.map((item) => item.trim())
.filter(Boolean)
}
}
module.exports = {

View File

@@ -188,17 +188,144 @@ class NewsApiClient {
return ''
}
const html = await response.text()
const content = worldNews
? this._extractElementHtmlById(html, 'content')
: this._extractTagHtml(html, 'article')
const normalizedContent = await this._replaceContentImages(content || '', detailUrl)
const normalizedContent = await this._extractContentFromHtml(html, detailUrl, worldNews)
if (normalizedContent) {
await this._detailCache.set(detailUrl, normalizedContent)
}
return normalizedContent
}
/**
* Extracts the article body from already-downloaded detail HTML and rewrites
* its inline images to locally-proxied URLs.
*
* @param {string} html
* @param {string} detailUrl
* @param {boolean} worldNews
* @returns {Promise<string>}
*/
async _extractContentFromHtml(html, detailUrl, worldNews) {
const content = worldNews
? this._extractElementHtmlById(html, 'content')
: this._extractTagHtml(html, 'article')
return this._replaceContentImages(content || '', detailUrl)
}
/**
* Builds a normalized article object from a bare Naver article URL.
*
* Unlike {@link fetchArticlesByCategory}, no list payload is available, so the
* article metadata (title, source, publish time, image) is parsed from the
* detail page's Open Graph / meta tags. Used to pin a specific article.
*
* @param {string} articleUrl
* @returns {Promise<object | null>}
*/
async buildArticleFromUrl(articleUrl) {
const sourceId = this._extractSourceIdFromUrl(articleUrl)
if (!sourceId) {
return null
}
const detailUrl = `${NAVER_ARTICLE_URL}${sourceId}`
let response
try {
response = await this._fetch(detailUrl)
} catch (_error) {
return null
}
if (!response.ok) {
return null
}
const html = await response.text()
const content = await this._extractContentFromHtml(html, detailUrl, false)
const title = this._extractMeta(html, 'og:title')
if (!title) {
return null
}
return {
source: {
id: sourceId.split('/')[0] || null,
name: this._cleanSourceName(
this._extractMeta(html, 'og:article:author')
|| this._extractMeta(html, 'twitter:creator')
|| this._extractMeta(html, 'og:site_name')
) || null
},
author: null,
title,
description: this._extractMeta(html, 'og:description') || this._extractMeta(html, 'description') || '',
url: detailUrl,
urlToImage: this._extractMeta(html, 'og:image') || '',
publishedAt: normalizePublishedAt(this._extractArticleDateTime(html)) || null,
content
}
}
/**
* Parses the `{officeId}/{articleId}` source id from any Naver article URL,
* tolerating the `/article/`, `/mnews/article/` and query-string variants.
*
* @param {string} articleUrl
* @returns {string}
*/
_extractSourceIdFromUrl(articleUrl) {
const match = String(articleUrl || '').match(/article\/(\d+)\/(\d+)/)
return match ? `${match[1]}/${match[2]}` : ''
}
/**
* Reads a `<meta>` content value by `property`/`name`, regardless of attribute
* order, and decodes HTML entities.
*
* @param {string} html
* @param {string} key
* @returns {string}
*/
_extractMeta(html, key) {
const escapedKey = escapeRegExp(key)
const patterns = [
new RegExp(`<meta[^>]*(?:property|name)=["']${escapedKey}["'][^>]*content=["']([^"']*)["']`, 'i'),
new RegExp(`<meta[^>]*content=["']([^"']*)["'][^>]*(?:property|name)=["']${escapedKey}["']`, 'i')
]
for (const pattern of patterns) {
const match = html.match(pattern)
if (match) {
return decodeHtmlEntities(match[1].trim())
}
}
return ''
}
/**
* Extracts the article publish timestamp from meta tags or the Naver
* `data-date-time` attribute on the article page.
*
* @param {string} html
* @returns {string}
*/
_extractArticleDateTime(html) {
const metaDate = this._extractMeta(html, 'article:published_time')
if (metaDate) {
return metaDate
}
const attrMatch = html.match(/data-date-time=["']([^"']+)["']/i)
return attrMatch ? attrMatch[1].trim() : ''
}
/**
* Normalizes a Naver press name such as "매일경제 | 네이버" to just the press
* name, matching the list-API `officeName` format.
*
* @param {string} name
* @returns {string}
*/
_cleanSourceName(name) {
return String(name || '').split('|')[0].trim()
}
/**
* @param {string} html
* @param {string} tagName
@@ -425,9 +552,33 @@ function normalizePublishedAt(value) {
const second = value.slice(12, 14)
return `${year}-${month}-${day}T${hour}:${minute}:${second}+09:00`
}
const spacedMatch = value.match(/^(\d{4}-\d{2}-\d{2})[ T](\d{2}:\d{2}:\d{2})$/)
if (spacedMatch) {
return `${spacedMatch[1]}T${spacedMatch[2]}+09:00`
}
return value
}
/**
 * Decodes the HTML entities this module cares about: hexadecimal and decimal
 * numeric references plus `&quot;`, `&apos;`, `&lt;`, `&gt;`, `&nbsp;` and
 * `&amp;`.
 *
 * Decoding happens in a single pass so the output of one substitution is
 * never re-interpreted as another entity (e.g. `&#38;lt;` decodes to the
 * literal text `&lt;`, not to `<`).
 *
 * @param {string} value - Text that may contain entities; nullish/empty yields ''.
 * @returns {string}
 */
function decodeHtmlEntities(value) {
  const namedEntities = {
    quot: '"',
    apos: "'",
    lt: '<',
    gt: '>',
    nbsp: ' ',
    amp: '&'
  }
  // Named entities are matched case-sensitively; only the hex marker and hex
  // digits accept either case.
  const entityPattern = /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(quot|apos|lt|gt|nbsp|amp));/g
  return String(value || '').replace(entityPattern, (_match, hex, dec, name) => {
    if (hex !== undefined) {
      return safeFromCodePoint(Number.parseInt(hex, 16))
    }
    if (dec !== undefined) {
      return safeFromCodePoint(Number.parseInt(dec, 10))
    }
    return namedEntities[name]
  })
}

/**
 * Converts a code point to its string form, yielding an empty string for
 * values `String.fromCodePoint` rejects (out-of-range numbers).
 *
 * @param {number} code
 * @returns {string}
 */
function safeFromCodePoint(code) {
  try {
    return String.fromCodePoint(code)
  } catch (_error) {
    return ''
  }
}
/**
 * Backslash-escapes every regular-expression metacharacter in the given value
 * so it can be embedded literally inside a `RegExp` source string.
 *
 * @param {*} value - Coerced with `String()` before escaping.
 * @returns {string}
 */
function escapeRegExp(value) {
  const metacharacters = '.*+?^${}()|[]\\'
  let escaped = ''
  for (const character of String(value)) {
    escaped += metacharacters.includes(character) ? `\\${character}` : character
  }
  return escaped
}

View File

@@ -4,10 +4,11 @@
* Coordinates category validation, refresh logic, and repository access.
*/
class NewsStorageService {
constructor(categoryCatalog, repository, apiClient) {
constructor(categoryCatalog, repository, apiClient, pinnedArticleService = null) {
this._categoryCatalog = categoryCatalog
this._repository = repository
this._apiClient = apiClient
// Optional collaborator: when left as null, no pinned articles are applied.
this._pinnedArticleService = pinnedArticleService
}
/**
@@ -39,7 +40,8 @@ class NewsStorageService {
async getCategoryNews(categoryKey, limit) {
const category = this._resolveCategory(categoryKey)
const document = await this._repository.readCategory(category)
return this._applyLimit(document, limit)
// Pins are merged in first so the limit is applied to the combined list.
const pinnedDocument = await this._applyPinned(category, document)
return this._applyLimit(pinnedDocument, limit)
}
/**
@@ -51,17 +53,19 @@ class NewsStorageService {
async getCategoryNewsPage(categoryKey, page, pageSize) {
const category = this._resolveCategory(categoryKey)
const document = await this._repository.readCategory(category)
// Paging is computed over the list with pins already prepended, so the pins
// occupy the first slots of page 1.
const pinnedDocument = await this._applyPinned(category, document)
const articles = pinnedDocument.articles
// A page that is not a positive integer falls back to the first page.
const safePage = Number.isInteger(page) && page > 0 ? page : 1
// A page size that is not a positive integer means "everything on one page".
const safePageSize = Number.isInteger(pageSize) && pageSize > 0 ? pageSize : document.articles.length
const safePageSize = Number.isInteger(pageSize) && pageSize > 0 ? pageSize : articles.length
const start = (safePage - 1) * safePageSize
const end = start + safePageSize
return {
...document,
total: document.total,
...pinnedDocument,
total: pinnedDocument.total,
page: safePage,
pageSize: safePageSize,
articles: document.articles.slice(start, end)
articles: articles.slice(start, end)
}
}
@@ -103,6 +107,36 @@ class NewsStorageService {
return category
}
/**
* Prepends the configured pinned articles to a category document, removing any
* duplicate of a pinned article already present in the fetched list so it only
* appears once (at the top). Returns the document unchanged when no pinning
* service is configured or no pins apply to the category.
*
* @param {import('../config/CategoryCatalog').NewsCategory} category
* @param {object} document
* @returns {Promise<object>}
*/
async _applyPinned(category, document) {
if (!this._pinnedArticleService) {
return document
}
const pinned = await this._pinnedArticleService.getPinnedArticles(category.key)
if (!pinned.length) {
return document
}
const pinnedUrls = new Set(pinned.map((article) => article.url))
const rest = document.articles.filter((article) => !pinnedUrls.has(article.url))
const articles = [...pinned, ...rest]
return {
...document,
total: articles.length,
articles
}
}
/**
* @param {object} document
* @param {number} limit

View File

@@ -0,0 +1,165 @@
const fs = require('fs/promises')
const path = require('path')
/**
 * PinnedArticleService
 *
 * Builds and caches a fixed set of "pinned" articles that should always appear
 * at the top of the news list served to the front-end. Articles are built from
 * bare Naver URLs via the {@link NewsApiClient} and persisted to disk so the
 * (relatively expensive) detail-page fetch is not repeated on every request.
 */
class PinnedArticleService {
  /**
   * @param {import('./NewsApiClient').NewsApiClient} apiClient
   * @param {object} [options]
   * @param {string[]} [options.urls] - Naver article URLs to pin, in display order.
   *   Falsy items and repeated URLs are ignored (first occurrence wins).
   * @param {string[] | null} [options.categoryKeys] - Category keys the pins apply to; null/empty means all.
   *   Keys are compared verbatim with the key passed to {@link getPinnedArticles}.
   *   NOTE(review): confirm callers pass the same form of key (canonical vs. alias) that is configured here.
   * @param {string} [options.dataDirectory] - Directory used for the persistent cache file.
   * @param {number} [options.ttlMs] - How long a built pin stays fresh before a rebuild is attempted.
   *   Non-finite or non-positive values fall back to 6 hours.
   * @param {() => Date} [options.now] - Clock injection for testability.
   */
  constructor(apiClient, options = {}) {
    this._apiClient = apiClient
    // De-duplicate while keeping first-seen order so a URL listed twice is
    // fetched and pinned only once.
    this._urls = Array.isArray(options.urls) ? [...new Set(options.urls.filter(Boolean))] : []
    this._categoryKeys = Array.isArray(options.categoryKeys) && options.categoryKeys.length > 0
      ? new Set(options.categoryKeys)
      : null
    this._ttlMs = Number.isFinite(options.ttlMs) && options.ttlMs > 0 ? options.ttlMs : 6 * 60 * 60 * 1000
    this._cachePath = path.resolve(options.dataDirectory || './data', 'pinned-cache.json')
    this._now = options.now || (() => new Date())
    // In-memory copy of the cache file; null until the first load.
    this._entries = null
    // Shared in-flight rebuild so concurrent requests trigger a single build.
    this._buildPromise = null
  }

  /**
   * Returns the pinned articles applicable to a category, in display order.
   * Articles are (re)built lazily and cached; on build failure any previously
   * cached version is reused so the front-end never loses the pin.
   *
   * @param {string} categoryKey
   * @returns {Promise<object[]>} Shallow copies of the cached article objects.
   */
  async getPinnedArticles(categoryKey) {
    if (!this._appliesTo(categoryKey)) {
      return []
    }
    const articles = await this._resolveArticles()
    return articles.map((article) => ({ ...article }))
  }

  /**
   * @param {string} categoryKey
   * @returns {boolean} False when no URLs are configured or the category is filtered out.
   */
  _appliesTo(categoryKey) {
    if (this._urls.length === 0) {
      return false
    }
    return this._categoryKeys === null || this._categoryKeys.has(categoryKey)
  }

  /**
   * Serves from cache while it is fresh; otherwise joins (or starts) a rebuild.
   *
   * @returns {Promise<object[]>}
   */
  async _resolveArticles() {
    const entries = await this._load()
    if (!this._isStale(entries)) {
      return this._orderedArticles(entries)
    }
    if (!this._buildPromise) {
      this._buildPromise = this._rebuild(entries).finally(() => {
        this._buildPromise = null
      })
    }
    return this._buildPromise
  }

  /**
   * Rebuilds every configured pin, one URL at a time. A URL whose build fails
   * keeps its previous entry (with its old `builtAt`, so it stays stale and is
   * retried on the next request).
   *
   * @param {Record<string, {article: object, builtAt: string}>} previousEntries
   * @returns {Promise<object[]>}
   */
  async _rebuild(previousEntries) {
    const nextEntries = {}
    for (const url of this._urls) {
      const article = await this._buildArticle(url)
      if (article) {
        nextEntries[url] = { article, builtAt: this._now().toISOString() }
      } else if (previousEntries[url]?.article) {
        // Keep the stale-but-valid version rather than dropping the pin.
        nextEntries[url] = previousEntries[url]
      }
    }
    this._entries = nextEntries
    await this._persist(nextEntries)
    return this._orderedArticles(nextEntries)
  }

  /**
   * Builds one article, converting a thrown error into the same "no article"
   * result as a null return so a single failing URL cannot reject the whole
   * rebuild.
   *
   * @param {string} url
   * @returns {Promise<object | null>}
   */
  async _buildArticle(url) {
    try {
      return (await this._apiClient.buildArticleFromUrl(url)) || null
    } catch (_error) {
      return null
    }
  }

  /**
   * @param {Record<string, {article: object, builtAt: string}>} entries
   * @returns {object[]} Cached articles in configured URL order, skipping URLs with no article.
   */
  _orderedArticles(entries) {
    return this._urls
      .map((url) => entries[url]?.article)
      .filter(Boolean)
  }

  /**
   * Stale when any configured pin has no usable cached article, has an
   * unparseable build timestamp, or was built longer ago than the TTL.
   *
   * @param {Record<string, {article: object, builtAt: string}>} entries
   * @returns {boolean}
   */
  _isStale(entries) {
    const nowMs = this._now().getTime()
    for (const url of this._urls) {
      const entry = entries[url]
      // An entry without an article (e.g. a hand-edited or damaged cache file)
      // would otherwise hide the pin until the TTL ran out.
      if (!entry || !entry.article) {
        return true
      }
      const builtMs = Date.parse(entry.builtAt)
      if (!Number.isFinite(builtMs) || nowMs - builtMs > this._ttlMs) {
        return true
      }
    }
    return false
  }

  /**
   * Loads the cache file once and memoizes it. A missing, unreadable or
   * malformed file is treated as an empty cache.
   *
   * @returns {Promise<Record<string, {article: object, builtAt: string}>>}
   */
  async _load() {
    if (this._entries) {
      return this._entries
    }
    try {
      const raw = await fs.readFile(this._cachePath, 'utf8')
      const parsed = JSON.parse(raw)
      this._entries = parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? parsed : {}
    } catch (_error) {
      this._entries = {}
    }
    return this._entries
  }

  /**
   * @param {Record<string, {article: object, builtAt: string}>} entries
   * @returns {Promise<void>}
   */
  async _persist(entries) {
    try {
      await fs.mkdir(path.dirname(this._cachePath), { recursive: true })
      await fs.writeFile(this._cachePath, `${JSON.stringify(entries, null, 2)}\n`)
    } catch (_error) {
      // Persistence is a best-effort optimization; in-memory cache still serves.
    }
  }
}
module.exports = {
PinnedArticleService
}