/**
* @module Infrastructure/SitemapHtml/Articles/Scanner
* @category Intelligence Operations / Supporting Infrastructure
* @name News article scanner — recursive, language-grouped
*
* @description
* Walks `news/` recursively, parses metadata from every
* `<slug>-<lang>.html` file (skipping index pages and metadata folders),
* and groups results by language. Articles are sorted newest-first by
* filename date prefix with filename as deterministic tiebreaker.
*
* Round-6 split: extracted from `scripts/generate-sitemap-html.ts`.
*
* @author Hack23 AB (Infrastructure Team)
* @license Apache-2.0
*/
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import type { Language } from '../../types/language.js';
// ES modules have no built-in `__filename` / `__dirname`; derive both from `import.meta.url`.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/** Path of the `news` directory, located three levels above this module's own directory. */
const NEWS_DIR = path.join(__dirname, '..', '..', '..', 'news');
/** Metadata describing a single localized news article file. */
export interface ArticleInfo {
  /** Path of the HTML file relative to the news directory, using `/` as separator. */
  file: string;
  /** Article title; the file name without `.html` when the page provides none. */
  title: string;
  /** Article description; empty string when the page provides none. */
  description: string;
  /** Language code taken from the `-<lang>.html` suffix of the file name. */
  lang: Language;
  /** File name portion that precedes the `-<lang>.html` suffix. */
  baseSlug: string;
  /** Publication date (YYYY-MM-DD) parsed from the file name prefix; empty string if absent. */
  date: string;
}
/**
 * Pull the leading ISO date (YYYY-MM-DD) out of a news article file name.
 *
 * The date must sit at the very start of the name and be followed by a `-`.
 *
 * @param fileName - Bare file name, e.g. `2026-02-13-some-article-en.html`.
 * @returns The date prefix, or an empty string when the name does not start
 *   with one. The empty string makes undated articles sort to the bottom of
 *   date-descending lists.
 */
export function extractArticleDate(fileName: string): string {
  const leadingDate = /^(\d{4}-\d{2}-\d{2})-/.exec(fileName);
  if (leadingDate === null) {
    return '';
  }
  return leadingDate[1]!;
}
// ---------------------------------------------------------------------------
// Functions
// ---------------------------------------------------------------------------
/**
 * Read an article's HTML file and derive its display title and description.
 *
 * Description — per `seo-metadata-contract.md` §3.h the longest non-empty
 * candidate wins among `og:description`, `<meta name="description">` and the
 * `description` field of the first JSON-LD `<script>` block. When candidates
 * have equal length, the earlier source in that order is kept.
 *
 * Title — `og:title` is used when present, otherwise `<title>`. A trailing
 * `Riksdagsmonitor` brand suffix introduced by `—`, `-` or `|` is stripped.
 *
 * @param filePath - Path of the HTML file to read.
 * @returns The extracted title and description. The title falls back to the
 *   file's base name (without `.html`) when no usable title is found or the
 *   file cannot be read; the description falls back to an empty string.
 */
export function extractArticleMeta(filePath: string): { title: string; description: string } {
  const fallbackTitle = path.basename(filePath, '.html');

  // Description of the first JSON-LD block, or null when the block is missing,
  // is not valid JSON, or carries no non-blank string description.
  const readJsonLdDescription = (html: string): string | null => {
    try {
      const block = /<script type="application\/ld\+json">([\s\S]*?)<\/script>/i.exec(html);
      if (block === null) {
        return null;
      }
      const data = JSON.parse(block[1]!.trim()) as { description?: string };
      if (typeof data.description !== 'string') {
        return null;
      }
      const trimmed = data.description.trim();
      return trimmed.length > 0 ? trimmed : null;
    } catch {
      return null;
    }
  };

  try {
    const html = fs.readFileSync(filePath, 'utf8');
    // First capture group of a pattern, or undefined when it does not match.
    const capture = (pattern: RegExp): string | undefined => pattern.exec(html)?.[1];

    const descriptionSources: Array<string | null | undefined> = [
      capture(/<meta\s+property="og:description"\s+content="([^"]+)"/i)?.trim(),
      capture(/<meta\s+name="description"\s+content="([^"]+)"/i)?.trim(),
      readJsonLdDescription(html),
    ];
    // Keep the longest candidate; the strict comparison keeps the earlier
    // source on ties.
    let description = '';
    for (const candidate of descriptionSources) {
      if (candidate && candidate.length > description.length) {
        description = candidate;
      }
    }

    // The fallback to <title> happens only when og:title is absent, not when
    // it is present but blank.
    const ogTitle = capture(/<meta\s+property="og:title"\s+content="([^"]+)"/i);
    const tagTitle = capture(/<title>([^<]+)<\/title>/i);
    const unbranded = (ogTitle ?? tagTitle ?? '')
      .trim()
      .replace(/\s*[—\-|]\s*Riksdagsmonitor\s*$/i, '')
      .trim();

    return {
      title: unbranded.length > 0 ? unbranded : fallbackTitle,
      description,
    };
  } catch (_error: unknown) {
    // Best effort: an unreadable file still yields a usable entry.
    return { title: fallbackTitle, description: '' };
  }
}
/**
 * Scan news articles and group them by language.
 *
 * The news directory is walked recursively, so articles under
 * date-partitioned subdirectories (e.g.
 * `news/2026/02/2026-02-13-article-en.html`) are included. Directories named
 * `metadata` or starting with `.` are skipped, as are `index.html`,
 * `index_*` files, and any `.html` file whose name does not end in
 * `-<lang>.html` for one of the recognised language codes.
 *
 * Each language's list is sorted by filename date prefix (YYYY-MM-DD) in
 * descending order so the most recent articles come first; articles without
 * a date prefix sort last. Ties are broken by descending `file` path.
 *
 * @returns A map from language to its sorted articles. A language with no
 *   articles has no entry; the map is empty when the news directory does not
 *   exist.
 */
export function getArticlesByLanguage(): Map<Language, ArticleInfo[]> {
  const articlesByLang = new Map<Language, ArticleInfo[]>();
  if (!fs.existsSync(NEWS_DIR)) return articlesByLang;

  // `<baseSlug>-<lang>.html`; hoisted so it is not rebuilt for every entry.
  const articleNamePattern = /^(.+?)-(en|sv|da|no|fi|de|fr|es|nl|ar|he|ja|ko|zh)\.html$/;

  function scanDir(dir: string): void {
    const entries = fs.readdirSync(dir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (entry.name === 'metadata' || entry.name.startsWith('.')) continue;
        scanDir(fullPath);
      } else if (
        entry.isFile() &&
        entry.name.endsWith('.html') &&
        entry.name !== 'index.html' &&
        !entry.name.startsWith('index_')
      ) {
        const match = entry.name.match(articleNamePattern);
        if (!match) continue;
        const baseSlug = match[1]!;
        const lang = match[2]! as Language;
        const meta = extractArticleMeta(fullPath);

        // Build a `/`-separated path relative to the news root, regardless
        // of the platform's path separator.
        const relDir = path.relative(NEWS_DIR, dir).split(path.sep).join('/');
        const hrefFile = relDir ? `${relDir}/${entry.name}` : entry.name;

        let bucket = articlesByLang.get(lang);
        if (bucket === undefined) {
          bucket = [];
          articlesByLang.set(lang, bucket);
        }
        bucket.push({
          file: hrefFile,
          title: meta.title,
          description: meta.description,
          lang,
          baseSlug,
          date: extractArticleDate(entry.name),
        });
      }
    }
  }

  scanDir(NEWS_DIR);

  for (const [, list] of articlesByLang) {
    list.sort((a, b) => {
      // Newest first; the empty date of undated articles compares lowest.
      if (a.date !== b.date) return b.date.localeCompare(a.date);
      return b.file.localeCompare(a.file);
    });
  }
  return articlesByLang;
}