diff --git a/.env.example b/.env.example
index ce11921..6c3dfab 100644
--- a/.env.example
+++ b/.env.example
@@ -2,3 +2,12 @@ PORT=3100
 NEWS_PAGE_SIZE=20
 NEWS_REFRESH_CRON=0 * * * *
 DATA_DIR=./data
+
+# Pinned articles: always returned first, in order, to the front-end.
+# Comma-separated Naver article URLs. Unset or blank falls back to the default URL hard-coded in ServiceConfig.
+NEWS_PINNED_ARTICLE_URLS=https://n.news.naver.com/mnews/article/009/0005689327
+# Categories the pins apply to (comma-separated keys: flashnews,mainnews,ranknews,worldnews). Leave empty for all categories.
+# NOTE(review): values are compared verbatim with the resolved category key, so front-end aliases (finance,business,technology,market) may not match - confirm.
+NEWS_PINNED_CATEGORIES=
+# How long (ms) a built pin stays cached before a rebuild is attempted. Default 6h.
+NEWS_PINNED_TTL_MS=21600000
diff --git a/src/app/NewsApplication.js b/src/app/NewsApplication.js
index 7b4a611..22e9092 100644
--- a/src/app/NewsApplication.js
+++ b/src/app/NewsApplication.js
@@ -3,6 +3,7 @@ const path = require('path')
 const { CategoryNewsRepository } = require('../repositories/CategoryNewsRepository')
 const { NewsApiClient } = require('../services/NewsApiClient')
 const { NewsStorageService } = require('../services/NewsStorageService')
+const { PinnedArticleService } = require('../services/PinnedArticleService')
 const { NewsRefreshScheduler } = require('../services/NewsRefreshScheduler')
 const { NewsController } = require('../controllers/NewsController')
 const { NewsApiCompatibilityController } = require('../controllers/NewsApiCompatibilityController')
@@ -24,7 +25,18 @@ class NewsApplication {
       pageSize: runtimeConfig.pageSize,
       dataDirectory: runtimeConfig.dataDirectory
     })
-    this._storageService = new NewsStorageService(this._categoryCatalog, this._repository, this._apiClient)
+    this._pinnedArticleService = new PinnedArticleService(this._apiClient, {
+      urls: runtimeConfig.pinnedArticleUrls,
+      categoryKeys: runtimeConfig.pinnedCategoryKeys,
+      dataDirectory: runtimeConfig.dataDirectory,
+      ttlMs: runtimeConfig.pinnedTtlMs
+    })
+    this._storageService = new NewsStorageService(
+      this._categoryCatalog,
+      this._repository,
+      this._apiClient,
+      this._pinnedArticleService
+    )
     this._scheduler = new NewsRefreshScheduler(this._storageService, runtimeConfig.refreshCron, this._logger)
     this._express = express()
     this._server = null
diff --git a/src/config/ServiceConfig.js b/src/config/ServiceConfig.js
index e9f33e4..f54492e 100644
--- a/src/config/ServiceConfig.js
+++ b/src/config/ServiceConfig.js
@@ -11,16 +11,38 @@ class ServiceConfig {
   }
 
   /**
-   * @returns {{port: number, pageSize: number, refreshCron: string, dataDirectory: string}}
+   * @returns {{port: number, pageSize: number, refreshCron: string, dataDirectory: string, pinnedArticleUrls: string[], pinnedCategoryKeys: string[] | null, pinnedTtlMs: number}}
    */
   toRuntimeConfig() {
     return {
       port: Number(this._env.PORT || 3100),
       pageSize: Number(this._env.NEWS_PAGE_SIZE || 20),
       refreshCron: this._env.NEWS_REFRESH_CRON || '0 * * * *',
-      dataDirectory: path.resolve(process.cwd(), this._env.DATA_DIR || './data')
+      dataDirectory: path.resolve(process.cwd(), this._env.DATA_DIR || './data'),
+      pinnedArticleUrls: this._parseList(this._env.NEWS_PINNED_ARTICLE_URLS, [
+        'https://n.news.naver.com/mnews/article/009/0005689327'
+      ]),
+      pinnedCategoryKeys: this._parseList(this._env.NEWS_PINNED_CATEGORIES, null),
+      pinnedTtlMs: Number(this._env.NEWS_PINNED_TTL_MS || 6 * 60 * 60 * 1000)
     }
   }
+
+  /**
+   * Parses a comma-separated environment value into a list of trimmed, non-empty items.
+   *
+   * @param {string | undefined} value
+   * @param {string[] | null} fallback - Returned as-is when the value is undefined, null, or blank.
+   * @returns {string[] | null} The parsed items (possibly empty, e.g. for ","), or the fallback.
+   */
+  _parseList(value, fallback) {
+    if (value === undefined || value === null || value.trim() === '') {
+      return fallback
+    }
+    return value
+      .split(',')
+      .map((item) => item.trim())
+      .filter(Boolean)
+  }
 }
 
 module.exports = {
diff --git a/src/services/NewsApiClient.js b/src/services/NewsApiClient.js
index d09a429..88efde8 100644
--- a/src/services/NewsApiClient.js
+++ b/src/services/NewsApiClient.js
@@ -188,17 +188,144 @@ class NewsApiClient {
       return ''
     }
     const html = await response.text()
-    const content = worldNews
-      ? this._extractElementHtmlById(html, 'content')
-      : this._extractTagHtml(html, 'article')
-
-    const normalizedContent = await this._replaceContentImages(content || '', detailUrl)
+    const normalizedContent = await this._extractContentFromHtml(html, detailUrl, worldNews)
     if (normalizedContent) {
       await this._detailCache.set(detailUrl, normalizedContent)
     }
     return normalizedContent
   }
 
+  /**
+   * Extracts the article body from already-downloaded detail HTML and passes it
+   * (or '' when nothing was extracted) through _replaceContentImages.
+   *
+   * @param {string} html
+   * @param {string} detailUrl
+   * @param {boolean} worldNews - When true the element with id "content" is used instead of the "article" tag.
+   * @returns {Promise<string>} Whatever _replaceContentImages resolves to (presumably the rewritten HTML - confirm).
+   */
+  async _extractContentFromHtml(html, detailUrl, worldNews) {
+    const content = worldNews
+      ? this._extractElementHtmlById(html, 'content')
+      : this._extractTagHtml(html, 'article')
+    return this._replaceContentImages(content || '', detailUrl)
+  }
+
+  /**
+   * Builds a normalized article object from a bare Naver article URL.
+   *
+   * Unlike {@link fetchArticlesByCategory}, no list payload is available, so the
+   * article metadata (title, source, publish time, image) is parsed from the
+   * detail page's Open Graph / meta tags. Used to pin a specific article.
+   *
+   * @param {string} articleUrl
+   * @returns {Promise<object | null>} null when the URL has no source id, the download fails or is not ok, or og:title is missing.
+   */
+  async buildArticleFromUrl(articleUrl) {
+    const sourceId = this._extractSourceIdFromUrl(articleUrl)
+    if (!sourceId) {
+      return null
+    }
+
+    const detailUrl = `${NAVER_ARTICLE_URL}${sourceId}`
+    let html
+    try {
+      const response = await this._fetch(detailUrl)
+      html = response.ok ? await response.text() : '' // body-read errors are handled like fetch errors
+    } catch (_error) {
+      return null
+    }
+    if (!html) {
+      return null
+    }
+
+    const title = this._extractMeta(html, 'og:title')
+    if (!title) {
+      return null
+    }
+    const content = await this._extractContentFromHtml(html, detailUrl, false) // only after the page is known to be usable
+
+    return {
+      source: {
+        id: sourceId.split('/')[0] || null,
+        name: this._cleanSourceName(
+          this._extractMeta(html, 'og:article:author')
+            || this._extractMeta(html, 'twitter:creator')
+            || this._extractMeta(html, 'og:site_name')
+        ) || null
+      },
+      author: null,
+      title,
+      description: this._extractMeta(html, 'og:description') || this._extractMeta(html, 'description') || '',
+      url: detailUrl,
+      urlToImage: this._extractMeta(html, 'og:image') || '',
+      publishedAt: normalizePublishedAt(this._extractArticleDateTime(html)) || null,
+      content
+    }
+  }
+
+  /**
+   * Parses the `{officeId}/{articleId}` source id from any Naver article URL,
+   * tolerating the `/article/`, `/mnews/article/` and query-string variants.
+   *
+   * @param {string} articleUrl
+   * @returns {string} The source id, or '' when the URL contains no `article/<digits>/<digits>` segment.
+   */
+  _extractSourceIdFromUrl(articleUrl) {
+    const match = String(articleUrl || '').match(/article\/(\d+)\/(\d+)/)
+    return match ? `${match[1]}/${match[2]}` : ''
+  }
+
+  /**
+   * Reads a `<meta>` content value by `property`/`name`, regardless of attribute
+   * order, and decodes HTML entities.
+   *
+   * @param {string} html
+   * @param {string} key - Value of the tag's `property`/`name` attribute, e.g. "og:title".
+   * @returns {string} The decoded, trimmed content, or '' when no tag matches.
+   */
+  _extractMeta(html, key) {
+    const escapedKey = escapeRegExp(key)
+    const patterns = [
+      new RegExp(`<meta[^>]*(?:property|name)=["']${escapedKey}["'][^>]*content=(?:"([^"]*)"|'([^']*)')`, 'i'),
+      new RegExp(`<meta[^>]*content=(?:"([^"]*)"|'([^']*)')[^>]*(?:property|name)=["']${escapedKey}["']`, 'i')
+    ]
+    for (const pattern of patterns) {
+      const match = html.match(pattern)
+      if (match) {
+        return decodeHtmlEntities((match[1] || match[2] || '').trim()) // group 1: "..." value, group 2: '...' value
+      }
+    }
+    return ''
+  }
+
+  /**
+   * Extracts the article publish timestamp from the `article:published_time` meta
+   * tag, falling back to the first `data-date-time` attribute on the page.
+   *
+   * @param {string} html
+   * @returns {string} The raw timestamp text, or '' when neither source is present.
+   */
+  _extractArticleDateTime(html) {
+    const metaDate = this._extractMeta(html, 'article:published_time')
+    if (metaDate) {
+      return metaDate
+    }
+    const attrMatch = html.match(/data-date-time=["']([^"']+)["']/i)
+    return attrMatch ? attrMatch[1].trim() : ''
+  }
+
+  /**
+   * Normalizes a Naver press name such as "매일경제 | 네이버" to just the press
+   * name, matching the list-API `officeName` format.
+   *
+   * @param {string} name
+   * @returns {string} The trimmed text before the first "|".
+   */
+  _cleanSourceName(name) {
+    return String(name || '').split('|')[0].trim()
+  }
+
   /**
    * @param {string} html
    * @param {string} tagName
@@ -425,9 +552,33 @@ function normalizePublishedAt(value) {
     const second = value.slice(12, 14)
     return `${year}-${month}-${day}T${hour}:${minute}:${second}+09:00`
   }
+  const spacedMatch = value.match(/^(\d{4}-\d{2}-\d{2})[ T](\d{2}:\d{2}:\d{2})$/)
+  if (spacedMatch) {
+    return `${spacedMatch[1]}T${spacedMatch[2]}+09:00`
+  }
   return value
 }
 
+function decodeHtmlEntities(value) {
+  return String(value || '')
+    .replace(/&#x([0-9a-f]+);/gi, (_match, hex) => safeFromCodePoint(parseInt(hex, 16)))
+    .replace(/&#(\d+);/g, (_match, dec) => safeFromCodePoint(parseInt(dec, 10)))
+    .replace(/&quot;/g, '"')
+    .replace(/&apos;/g, "'")
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&nbsp;/g, ' ')
+    .replace(/&amp;/g, '&') // must stay last so "&amp;lt;" decodes once to "&lt;", not twice to "<"
+}
+
+function safeFromCodePoint(code) {
+  try {
+    return String.fromCodePoint(code)
+  } catch (_error) {
+    return ''
+  }
+}
+
 function escapeRegExp(value) {
   return String(value).replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
 }
diff --git a/src/services/NewsStorageService.js b/src/services/NewsStorageService.js
index ca82b73..69bf1b2 100644
--- a/src/services/NewsStorageService.js
+++ b/src/services/NewsStorageService.js
@@ -4,10 +4,11 @@
  * Coordinates category validation, refresh logic, and repository access.
  */
 class NewsStorageService {
-  constructor(categoryCatalog, repository, apiClient) {
+  constructor(categoryCatalog, repository, apiClient, pinnedArticleService = null) {
     this._categoryCatalog = categoryCatalog
     this._repository = repository
     this._apiClient = apiClient
+    this._pinnedArticleService = pinnedArticleService
   }
 
   /**
@@ -39,7 +40,8 @@ class NewsStorageService {
   async getCategoryNews(categoryKey, limit) {
     const category = this._resolveCategory(categoryKey)
     const document = await this._repository.readCategory(category)
-    return this._applyLimit(document, limit)
+    const pinnedDocument = await this._applyPinned(category, document)
+    return this._applyLimit(pinnedDocument, limit)
   }
 
   /**
@@ -51,17 +53,19 @@ class NewsStorageService {
   async getCategoryNewsPage(categoryKey, page, pageSize) {
     const category = this._resolveCategory(categoryKey)
     const document = await this._repository.readCategory(category)
+    const pinnedDocument = await this._applyPinned(category, document)
+    const articles = pinnedDocument.articles
     const safePage = Number.isInteger(page) && page > 0 ? page : 1
-    const safePageSize = Number.isInteger(pageSize) && pageSize > 0 ? pageSize : document.articles.length
+    const safePageSize = Number.isInteger(pageSize) && pageSize > 0 ? pageSize : articles.length
     const start = (safePage - 1) * safePageSize
     const end = start + safePageSize
 
     return {
-      ...document,
-      total: document.total,
+      ...pinnedDocument,
+      total: pinnedDocument.total,
       page: safePage,
       pageSize: safePageSize,
-      articles: document.articles.slice(start, end)
+      articles: articles.slice(start, end)
     }
   }
 
@@ -103,6 +107,36 @@ class NewsStorageService {
     return category
   }
 
+  /**
+   * Prepends the applicable pinned articles to a category document. Any fetched
+   * article whose `url` equals a pinned article's `url` is dropped so it appears
+   * only once (at the top), and `total` is recomputed as the merged list length.
+   * Returns the document unchanged when no pinning service is set or no pins apply.
+   *
+   * @param {import('../config/CategoryCatalog').NewsCategory} category
+   * @param {object} document - Category document; its `articles` array is read but not mutated.
+   * @returns {Promise<object>} The same document, or a shallow copy with pins first and `total` set to the merged length.
+   */
+  async _applyPinned(category, document) {
+    if (!this._pinnedArticleService) {
+      return document
+    }
+    const pinned = await this._pinnedArticleService.getPinnedArticles(category.key)
+    if (!pinned.length) {
+      return document
+    }
+
+    const pinnedUrls = new Set(pinned.map((article) => article.url))
+    const rest = document.articles.filter((article) => !pinnedUrls.has(article.url))
+    const articles = [...pinned, ...rest]
+
+    return {
+      ...document,
+      total: articles.length,
+      articles
+    }
+  }
+
   /**
    * @param {object} document
    * @param {number} limit
diff --git a/src/services/PinnedArticleService.js b/src/services/PinnedArticleService.js
new file mode 100644
index 0000000..0956b4f
--- /dev/null
+++ b/src/services/PinnedArticleService.js
@@ -0,0 +1,165 @@
+const fs = require('fs/promises')
+const path = require('path')
+
+/**
+ * PinnedArticleService
+ *
+ * Builds and caches a fixed set of "pinned" articles that should always appear
+ * at the top of the news list served to the front-end. Articles are built from
+ * bare Naver URLs via the {@link NewsApiClient} and persisted to disk so the
+ * (relatively expensive) detail-page fetch is not repeated on every request.
+ */
+class PinnedArticleService {
+  /**
+   * @param {import('./NewsApiClient').NewsApiClient} apiClient
+   * @param {object} [options]
+   * @param {string[]} [options.urls] - Naver article URLs to pin, in display order.
+   * @param {string[] | null} [options.categoryKeys] - Category keys the pins apply to; null/empty means all.
+   * @param {string} [options.dataDirectory] - Directory used for the persistent cache file.
+   * @param {number} [options.ttlMs] - How long a built pin stays fresh before a rebuild is attempted.
+   * @param {() => Date} [options.now] - Clock injection for testability.
+   */
+  constructor(apiClient, options = {}) {
+    this._apiClient = apiClient
+    this._urls = Array.isArray(options.urls) ? options.urls.filter(Boolean) : []
+    this._categoryKeys = Array.isArray(options.categoryKeys) && options.categoryKeys.length > 0
+      ? new Set(options.categoryKeys)
+      : null
+    this._ttlMs = Number.isFinite(options.ttlMs) && options.ttlMs > 0 ? options.ttlMs : 6 * 60 * 60 * 1000
+    this._cachePath = path.resolve(options.dataDirectory || './data', 'pinned-cache.json')
+    this._now = options.now || (() => new Date())
+    this._entries = null
+    this._buildPromise = null
+  }
+
+  /**
+   * Returns the pinned articles applicable to a category, in display order.
+   * Articles are (re)built lazily and cached; on build failure any previously
+   * cached version is reused so the front-end never loses the pin.
+   *
+   * @param {string} categoryKey
+   * @returns {Promise<object[]>} Shallow copies of the cached articles; [] when the pins do not apply.
+   */
+  async getPinnedArticles(categoryKey) {
+    if (!this._appliesTo(categoryKey)) {
+      return []
+    }
+    const articles = await this._resolveArticles()
+    return articles.map((article) => ({ ...article }))
+  }
+
+  /**
+   * @param {string} categoryKey - Compared verbatim against the configured category keys.
+   * @returns {boolean} False when no URLs are configured; otherwise true for every category unless keys were configured.
+   */
+  _appliesTo(categoryKey) {
+    if (this._urls.length === 0) {
+      return false
+    }
+    return this._categoryKeys === null || this._categoryKeys.has(categoryKey)
+  }
+
+  /**
+   * @returns {Promise<object[]>} Cached articles when fresh; otherwise the result of the single shared in-flight rebuild.
+   */
+  async _resolveArticles() {
+    const entries = await this._load()
+    if (!this._isStale(entries)) {
+      return this._orderedArticles(entries)
+    }
+
+    if (!this._buildPromise) {
+      this._buildPromise = this._rebuild(entries).finally(() => {
+        this._buildPromise = null
+      })
+    }
+    return this._buildPromise
+  }
+
+  /**
+   * @param {Record<string, {article: object, builtAt: string}>} previousEntries
+   * @returns {Promise<object[]>} The articles of the rebuilt entries, in configured URL order.
+   */
+  async _rebuild(previousEntries) {
+    const nextEntries = {}
+    for (const url of this._urls) {
+      const article = await this._apiClient.buildArticleFromUrl(url).catch(() => null) // a rejected build is just a failed build
+      if (article) {
+        nextEntries[url] = { article, builtAt: this._now().toISOString() }
+      } else if (previousEntries[url]) {
+        // Keep the stale-but-valid version rather than dropping the pin.
+        nextEntries[url] = previousEntries[url]
+      }
+    }
+
+    this._entries = nextEntries
+    await this._persist(nextEntries)
+    return this._orderedArticles(nextEntries)
+  }
+
+  /**
+   * @param {Record<string, {article: object, builtAt: string}>} entries
+   * @returns {object[]} Articles in configured URL order; URLs without a cached article are skipped.
+   */
+  _orderedArticles(entries) {
+    return this._urls
+      .map((url) => entries[url]?.article)
+      .filter(Boolean)
+  }
+
+  /**
+   * Stale when any configured pin has no cached entry or an entry older than the TTL.
+   * NOTE(review): a pin that keeps failing to build stays stale, so every call retries the fetch.
+   *
+   * @param {Record<string, {article: object, builtAt: string}>} entries
+   * @returns {boolean}
+   */
+  _isStale(entries) {
+    const nowMs = this._now().getTime()
+    for (const url of this._urls) {
+      const entry = entries[url]
+      if (!entry) {
+        return true
+      }
+      const builtMs = Date.parse(entry.builtAt)
+      if (!Number.isFinite(builtMs) || nowMs - builtMs > this._ttlMs) {
+        return true
+      }
+    }
+    return false
+  }
+
+  /**
+   * @returns {Promise<Record<string, {article: object, builtAt: string}>>} In-memory entries, read from the cache file on first use ({} if unreadable).
+   */
+  async _load() {
+    if (this._entries) {
+      return this._entries
+    }
+    try {
+      const raw = await fs.readFile(this._cachePath, 'utf8')
+      const parsed = JSON.parse(raw)
+      this._entries = parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? parsed : {}
+    } catch (_error) {
+      this._entries = {}
+    }
+    return this._entries
+  }
+
+  /**
+   * @param {Record<string, {article: object, builtAt: string}>} entries
+   * @returns {Promise<void>} Never rejects; write errors are ignored.
+   */
+  async _persist(entries) {
+    try {
+      await fs.mkdir(path.dirname(this._cachePath), { recursive: true })
+      await fs.writeFile(this._cachePath, `${JSON.stringify(entries, null, 2)}\n`)
+    } catch (_error) {
+      // Persistence is a best-effort optimization; in-memory cache still serves.
+    }
+  }
+}
+
+module.exports = {
+  PinnedArticleService
+}