All files / scripts/fetch-calendar/scraper extractors.ts

100% Statements 28/28
71.42% Branches 25/35
100% Functions 11/11
100% Lines 28/28

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114                                35x         17x 17x         28x 28x 28x         12x 12x               7x 7x       7x 7x                     12x 12x 12x   12x 12x 12x 5x 5x 5x       12x         2x 2x         7x                     18x         14x         12x                
/**
 * @module scripts/fetch-calendar/scraper/extractors
 * @description Low-level HTML extraction primitives used by the
 * Riksdag kalendarium scraper.
 *
 * Each helper is intentionally small and regex-based (no external HTML
 * parser) so they can be fuzz-tested individually against malformed HTML.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { decodeHtmlEntities } from '../../html-utils.js';
 
/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result can be embedded in a `new RegExp(...)` pattern and match literally.
 *
 * @param s - Arbitrary text to be matched literally.
 * @returns `s` with each metacharacter prefixed by a backslash.
 */
export function escapeRegex(s: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (ch) => `\\${ch}`);
}
 
/**
 * Extract the quoted `datetime` attribute value from a `<time>` tag.
 *
 * The attribute name must be preceded by whitespace, so look-alike
 * attributes such as `data-datetime` are not mistaken for it, and
 * whitespace around `=` is tolerated (matching the other attribute
 * helpers in this module).
 *
 * @param html - HTML fragment to search.
 * @returns The attribute value as written (possibly an empty string), or
 *   `null` when no `<time>` tag with a quoted `datetime` attribute is found.
 */
export function extractDatetime(html: string): string | null {
  const m = html.match(/<time\b[^>]*\sdatetime\s*=\s*(["'])(.*?)\1/i);
  return m ? (m[2] ?? null) : null;
}
 
/**
 * Read the quoted value of a `data-{name}` attribute from a tag's
 * attribute string (case-insensitive, whitespace around `=` allowed).
 *
 * @param attrs - Raw attribute text of a tag.
 * @param name - Attribute suffix after `data-`; matched literally.
 * @returns The trimmed value, or `null` when the attribute is absent or
 *   its value is empty after trimming.
 */
export function extractDataAttr(attrs: string, name: string): string | null {
  const pattern = new RegExp(`\\bdata-${escapeRegex(name)}\\s*=\\s*(["'])(.*?)\\1`, 'i');
  const value = pattern.exec(attrs)?.[2]?.trim();
  return value ? value : null;
}
 
/**
 * Report whether the `class` attribute in an attribute string lists
 * `calendar-item` as one of its whitespace-separated tokens.
 *
 * @param attrs - Raw attribute text of a tag.
 * @returns `true` only for an exact token match; `false` when there is no
 *   quoted `class` attribute or the token is missing.
 */
export function hasCalendarItemClass(attrs: string): boolean {
  const classMatch = /\bclass\s*=\s*(["'])(.*?)\1/i.exec(attrs);
  if (classMatch === null) {
    return false;
  }
  const tokens = (classMatch[2] ?? '').split(/\s+/);
  return tokens.includes('calendar-item');
}
 
/**
 * Return the text content of the first `<span>` whose quoted `class`
 * attribute value contains `name` as a substring.
 *
 * Nested tags inside the span are removed via `stripTags` and the result
 * is trimmed. The span body is matched non-greedily up to the first
 * `</span>`.
 *
 * @param html - HTML fragment to search.
 * @param name - Text to look for inside the class value; matched literally.
 * @returns The trimmed text, or `null` when no span matches or the text
 *   is empty.
 */
export function extractSpanText(html: string, name: string): string | null {
  const pattern = new RegExp(
    `<span\\b[^>]*\\bclass\\s*=\\s*(["'])[^"']*${escapeRegex(name)}[^"']*\\1[^>]*>([\\s\\S]*?)<\\/span>`,
    'i',
  );
  const found = pattern.exec(html);
  if (found === null) {
    return null;
  }
  const text = stripTags(found[2] ?? '').trim();
  return text === '' ? null : text;
}
 
/**
 * Extract a summary and the document reference links from an event block.
 *
 * The summary is the inner HTML of the first `<h1>`–`<h6>` element,
 * returned as matched (tags inside it are not stripped here). Only when no
 * heading element is present does it fall back to the first anchor's
 * content.
 *
 * @param html - HTML fragment for a single event.
 * @returns `summary` as described above, and `docRefs`: the trimmed `href`
 *   of every `<a>` tag accepted by `isRiksdagDocumentHref`, in document
 *   order (duplicates are kept).
 */
export function extractHeadingAndLinks(html: string): {
  summary: string;
  docRefs: string[];
} {
  const headingRe = /<h[1-6]\b[^>]*>([\s\S]*?)<\/h[1-6]>/i;
  const headingMatch = html.match(headingRe);
  const summary = headingMatch ? (headingMatch[1] ?? '') : extractFirstAnchorText(html);

  const docRefs: string[] = [];
  const hrefRe = /<a\b[^>]*\bhref=(["'])([^"']+)\1[^>]*>/gi;
  for (const m of html.matchAll(hrefRe)) {
    const href = (m[2] ?? '').trim();
    // Keep only links that point at Riksdag documents/proceedings.
    if (isRiksdagDocumentHref(href)) {
      docRefs.push(href);
    }
  }

  return { summary, docRefs };
}
 
/**
 * Return the content between the first `<a ...>` tag and the nearest
 * following `</a>` in an HTML fragment.
 *
 * The content is returned as matched; nested tags are not stripped.
 *
 * @param html - HTML fragment to search.
 * @returns The anchor's inner content, or `''` when there is no anchor.
 */
export function extractFirstAnchorText(html: string): string {
  const anchor = /<a\b[^>]*>([\s\S]*?)<\/a>/i.exec(html);
  if (anchor === null) {
    return '';
  }
  return anchor[1] ?? '';
}
 
/**
 * Decide whether an href points at a Riksdag document or proceedings page.
 *
 * The check is a case-sensitive substring test against a fixed list of
 * path markers.
 *
 * @param href - Link target to classify.
 * @returns `true` when `href` contains at least one known marker.
 */
export function isRiksdagDocumentHref(href: string): boolean {
  const markers = [
    '/dokument',
    '/betankanden',
    '/propositioner',
    '/motioner',
    '/interpellationer',
  ];
  return markers.some((marker) => href.includes(marker));
}
 
/**
 * Replace every `<...>` tag with a space, then collapse each run of
 * whitespace into a single space.
 *
 * The result is not trimmed, so it may start or end with one space.
 *
 * @param html - HTML fragment.
 * @returns The fragment with tags removed and whitespace collapsed.
 */
export function stripTags(html: string): string {
  const withoutTags = html.replace(/<[^>]+>/g, ' ');
  return withoutTags.replace(/\s+/g, ' ');
}
 
/**
 * Normalize a committee/organ code: collapse every whitespace run into a
 * single space and strip leading/trailing whitespace. Letter case and all
 * other characters are left untouched.
 *
 * @param raw - Code as scraped.
 * @returns The whitespace-normalized code.
 */
export function normalizeOrgCode(raw: string): string {
  const collapsed = raw.replace(/\s+/g, ' ');
  return collapsed.trim();
}
 
/**
 * Normalize an activity type string to lower-case-with-hyphens.
 *
 * Steps: trim surrounding whitespace, lower-case, turn each internal
 * whitespace run into a single `-`, then drop every character that is not
 * `a`–`z`, `0`–`9`, `-`, `å`, `ä` or `ö`.
 *
 * Trimming happens first on purpose: trimming after the whitespace-to-hyphen
 * step has no effect and leaves stray leading/trailing hyphens for padded
 * input.
 *
 * @param raw - Activity type as scraped.
 * @returns The normalized slug (may be empty).
 */
export function normalizeAkt(raw: string): string {
  return raw
    .trim()
    .toLowerCase()
    .replace(/\s+/g, '-')
    .replace(/[^a-z0-9-åäö]/g, '');
}
 
export { decodeHtmlEntities };