base:url cleanup

lovebird 2025-03-29 11:21:11 +01:00
parent ed0d8b60f2
commit f01f52ca45
9 changed files with 7 additions and 557 deletions

View File

@@ -24,7 +24,7 @@
"test:model": "vitest run src/model",
"test:model:watch": "vitest watch src/model",
"test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch",
"test:url": "vitest run src/base/url.test.ts src/base/link-preview.test.ts"
"test:url": "vitest run src/base/url.test.ts"
},
"dependencies": {
"@astrojs/compiler": "^2.10.4",

View File

@@ -1,9 +0,0 @@
import { jest } from '@jest/globals';
export const meta = jest.fn().mockResolvedValue({
title: 'Test Title',
description: 'Test Description',
image: 'https://example.com/image.jpg',
favicon: 'https://example.com/favicon.ico',
siteName: 'Example Site'
});

View File

@@ -1,105 +0,0 @@
import { jest } from '@jest/globals';
import { urlCache } from '../url-cache.js';
import { validateUrl } from '../../model/filters.js';
import fs from 'fs/promises';
import path from 'path';
jest.mock('../url.js');
// Mock fetch
const mockFetch = jest.fn().mockImplementation(
(): Promise<Response> =>
Promise.resolve(new Response(null, {
status: 200,
statusText: 'OK'
}))
);
(global as any).fetch = mockFetch;
const testMeta = {
title: 'Test Title',
description: 'Test Description',
image: 'https://example.com/image.jpg',
favicon: 'https://example.com/favicon.ico',
siteName: 'Example Site'
};
describe('UrlCache', () => {
const testUrl = 'https://example.com';
beforeEach(async () => {
// Clear cache before each test
await urlCache.clear();
// Reset fetch mock
mockFetch.mockClear();
});
afterAll(async () => {
// Clean up after all tests
await urlCache.clear();
});
test('should store and retrieve URL validity', async () => {
await urlCache.set(testUrl, true);
const result = await urlCache.get(testUrl);
expect(result).toBeTruthy();
expect(result?.isValid).toBe(true);
});
test('should store and retrieve meta information', async () => {
await urlCache.set(testUrl, true, testMeta);
const result = await urlCache.get(testUrl);
expect(result?.meta).toEqual(testMeta);
});
test('should handle invalid URLs', async () => {
await urlCache.set(testUrl, false);
const result = await urlCache.get(testUrl);
expect(result?.isValid).toBe(false);
});
test('should expire cache entries', async () => {
// Set a URL with a very old timestamp
const oldEntry = {
isValid: true,
timestamp: Date.now() - (8 * 24 * 60 * 60 * 1000), // 8 days old
meta: testMeta
};
const cacheFile = path.join(process.cwd(), '.cache', 'url-cache.json');
await fs.writeFile(cacheFile, JSON.stringify({ [testUrl]: oldEntry }));
const result = await urlCache.get(testUrl);
expect(result).toBeNull();
});
test('validateUrl should store meta information', async () => {
const isValid = await validateUrl(testUrl);
expect(isValid).toBe(true);
expect(mockFetch).toHaveBeenCalledWith(
testUrl,
expect.objectContaining({
signal: expect.any(AbortSignal),
redirect: 'follow'
})
);
const result = await urlCache.get(testUrl);
expect(result?.isValid).toBe(true);
expect(result?.meta).toEqual(testMeta);
});
test('expandUrls should add meta information to valid URLs without meta', async () => {
// Add a URL without meta info
await urlCache.set(testUrl, true);
let result = await urlCache.get(testUrl);
expect(result?.meta).toBeUndefined();
// Expand URLs
await urlCache.expandUrls();
// Check that meta info was added
result = await urlCache.get(testUrl);
expect(result?.meta).toEqual(testMeta);
});
});
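
For reference, the cache surface this deleted test exercised can be sketched from the calls above; the entry shape, method names, and the roughly one-week expiry are inferred from the test's assertions, not confirmed elsewhere in this diff:

// Hedged sketch of the UrlCache API used by the test above; the CacheEntry shape
// and the ~7-day expiry are inferred from the assertions, not from url-cache.ts itself.
interface UrlMeta {
  title: string;
  description: string;
  image: string;
  favicon: string;
  siteName: string;
}

interface CacheEntry {
  isValid: boolean;
  timestamp: number;   // ms epoch; entries older than roughly 7 days are treated as expired
  meta?: UrlMeta;
}

interface UrlCache {
  get(url: string): Promise<CacheEntry | null>;   // null when missing or expired
  set(url: string, isValid: boolean, meta?: UrlMeta): Promise<void>;
  clear(): Promise<void>;
  expandUrls(): Promise<void>;                    // backfills meta for valid entries that lack it
}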

View File

@@ -14,7 +14,7 @@ import { sync as exists } from "@polymech/fs/exists";
import { sync as read } from "@polymech/fs/read";
import { createMarkdownComponent } from "@/base/index.js";
import { translate } from "@/base/i18n.js";
import { applyFilters, shortenUrl } from "@/model/howto.js";
import { applyFilters, shortenUrl } from "@/base/filters.js";
import {
HOWTO_FILES_WEB,
HOWTO_FILES_ABS,

View File

@@ -14,7 +14,7 @@ import { sync as exists } from "@polymech/fs/exists";
import { sync as read } from "@polymech/fs/read";
import { createHTMLComponent, createMarkdownComponent } from "@/base/index.js";
import { translate } from "@/base/i18n.js";
import { applyFilters, shortenUrl } from "@/model/howto.js";
import { applyFilters, shortenUrl } from "@/base/filters.js";
// import { extract, extract_learned_skills, references } from "@/base/kbot.js";
import {
HOWTO_FILES_WEB,

View File

@@ -1,166 +0,0 @@
import './test-setup.js';
import { describe, it, expect } from 'vitest';
import {
shortenUrl,
renderLinks,
filterBannedPhrases,
replaceWords,
applyFilters,
default_filters_plain,
default_filters_markdown,
item_path,
validateLinks
} from './filters.js';
describe('filters', () => {
describe('item_path', () => {
it('should generate correct path from item', () => {
const item = { data: { slug: 'test-slug' } };
expect(item_path(item)).toBe('/howto/test-slug');
});
});
describe('shortenUrl', () => {
it('should remove www. prefix and trailing slashes', () => {
expect(shortenUrl('https://www.example.com/path/')).toBe('example.com/path');
});
it('should handle URLs without www. prefix', () => {
expect(shortenUrl('https://example.com/path')).toBe('example.com/path');
});
it('should handle invalid URLs gracefully', () => {
expect(shortenUrl('invalid-url')).toBe('invalid-url');
});
it('should handle URLs with query parameters', () => {
expect(shortenUrl('https://example.com/path?param=value')).toBe('example.com/path?param=value');
});
});
describe('renderLinks', () => {
it('should render non-blacklisted links', () => {
const input = 'Check out https://example.com';
const expected = 'Check out <a class="text-orange-600 underline" href="https://example.com" target="_blank" rel="noopener noreferrer">example.com</a>';
expect(renderLinks(input)).toBe(expected);
});
it('should replace blacklisted links with empty string', () => {
const input = 'Check out https://preciousplastic.com';
expect(renderLinks(input)).toBe('Check out ');
});
it('should handle multiple links in text', () => {
const input = 'Check out https://example.com and https://preciousplastic.com';
const result = renderLinks(input);
expect(result).toContain('example.com');
expect(result).toContain('and ');
});
});
describe('filterBannedPhrases', () => {
it('should replace banned words with [filtered]', () => {
const input = 'The wizard used magic2';
const expected = 'The [filtered] used [filtered]';
expect(filterBannedPhrases(input)).toBe(expected);
});
it('should handle case-insensitive matching', () => {
const input = 'The WIZARD used MAGIC2';
const expected = 'The [filtered] used [filtered]';
expect(filterBannedPhrases(input)).toBe(expected);
});
it('should not replace partial matches', () => {
const input = 'The wizardry used magic2.0';
const expected = 'The wizardry used [filtered].0';
expect(filterBannedPhrases(input)).toBe(expected);
});
});
describe('replaceWords', () => {
it('should replace words according to wordReplaceMap', () => {
const input = 'I need a Router for my Car';
const expected = 'I need a CNC Router for my tufftuff';
expect(replaceWords(input)).toBe(expected);
});
it('should handle multi-word replacements', () => {
const input = 'I need a laptop stand';
expect(replaceWords(input)).toBe('I need a laptoppie');
});
it('should handle case-insensitive matching', () => {
const input = 'I need a ROUTER for my CAR';
const expected = 'I need a CNC Router for my tufftuff';
expect(replaceWords(input)).toBe(expected);
});
});
describe('applyFilters', () => {
it('should apply plain text filters in sequence', async () => {
const input = 'Check out https://example.com with the wizard Router';
const result = await applyFilters(input, default_filters_plain);
expect(result).toContain('example.com');
expect(result).toContain('[filtered]');
expect(result).toContain('CNC Router');
});
it('should apply markdown filters in sequence', async () => {
const input = 'Check out [example](https://example.com) with the wizard Router';
const result = await applyFilters(input, default_filters_markdown);
expect(result).toContain('example');
expect(result).toContain('[filtered]');
expect(result).toContain('CNC Router');
});
it('should handle empty input', async () => {
expect(await applyFilters('')).toBe('');
});
it('should handle custom filter array', async () => {
const customFilters = [filterBannedPhrases];
const input = 'The wizard used magic2';
const expected = 'The [filtered] used [filtered]';
expect(await applyFilters(input, customFilters)).toBe(expected);
});
it('should handle markdown links with blacklisted URLs', async () => {
const input = 'Check out [example](https://preciousplastic.com)';
const result = await applyFilters(input, default_filters_markdown);
expect(result).toBe('Check out example');
});
});
describe('validateLinks', () => {
it('should remove invalid links entirely', async () => {
const input = 'Check out [example](https://invalid-url-that-does-not-exist.com)';
const result = await validateLinks(input);
expect(result).toBe('Check out example');
});
it('should preserve valid links', async () => {
const input = 'Check out [example](https://example.com)';
const result = await validateLinks(input);
expect(result).toBe('Check out [example](https://example.com)');
});
it('should handle multiple links in text', async () => {
const input = 'Check out [valid](https://example.com) and [invalid](https://invalid-url-that-does-not-exist.com)';
const result = await validateLinks(input);
expect(result).toBe('Check out [valid](https://example.com) and invalid');
});
it('should handle links with special characters', async () => {
const input = '[special](https://example.com/path?param=value#fragment)';
const result = await validateLinks(input);
expect(result).toBe('[special](https://example.com/path?param=value#fragment)');
});
it('should handle links with special characters that are invalid', async () => {
const input = '[special](https://invalid-url-that-does-not-exist.com/path?param=value#fragment)';
const result = await validateLinks(input);
expect(result).toBe('special');
});
});
});

View File

@@ -1,270 +0,0 @@
process.env['NODE_TLS_REJECT_UNAUTHORIZED'] = '0';
export * from './howto-model.js'
import { HOWTO_ROOT } from "config/config.js";
import { filterMarkdownLinks } from "../base/markdown.js";
import { urlCache } from '../base/url-cache.js';
import { meta } from '../base/url.js';
interface Item {
data: {
slug: string;
};
}
export interface FilterFunction { (text: string): string | Promise<string> }
export const blacklist: readonly string[] = [
'precious-plastic',
'fair-enough',
'mad-plastic-labs',
'easymoulds',
'plasticpreneur',
'sustainable-design-studio',
'johannplasto'
] as const;
export const urlBlacklist: readonly string[] = [
"thenounproject.com",
"preciousplastic.com",
"community.preciousplastic.com",
"bazar.preciousplastic.com",
"onearmy.earth"
] as const;
export const bannedWords: readonly string[] = [
"wizard",
"magic2",
"precious plastic",
"onearmy"
] as const;
export const wordReplaceMap: Readonly<Record<string, string>> = {
Router: "CNC Router",
"laptop stand": "laptoppie",
Car: "tufftuff"
} as const;
export const item_path = (item: Item): string => `${HOWTO_ROOT()}/${item.data.slug}`;
/**
* Shortens a URL by removing 'www.' prefix and trailing slashes
* @param url - The URL to shorten
* @returns The shortened URL or the original URL if invalid
*/
export const shortenUrl = (url: string): string => {
try {
const { hostname, pathname, search } = new URL(url);
const cleanHost = hostname.replace(/^www\./, '');
const cleanPath = pathname.replace(/\/$/, '');
return `${cleanHost}${decodeURIComponent(cleanPath)}${search}`;
} catch (error) {
console.warn(`Invalid URL provided to shortenUrl: ${url}`);
return url;
}
};
/**
* Gets the domain name from a URL
* @param url - The URL to extract domain from
* @returns The domain name or empty string if invalid
*/
export const getDomain = (url: string): string => {
try {
const { hostname } = new URL(url);
return hostname.replace(/^www\./, '');
} catch {
return '';
}
};
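/**
 * Validates that a URL is reachable, caching the result along with fetched meta information
 * @param url - The URL to validate
 * @param timeout - Timeout in milliseconds (default: 22500)
 * @returns Promise resolving to true if the URL responds successfully, false otherwise
 */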
export async function validateUrl(
url: string,
timeout: number = 22500
): Promise<boolean> {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), timeout);
try {
const response = await fetch(url, {
signal: controller.signal,
redirect: 'follow',
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+ 'AppleWebKit/537.36 (KHTML, like Gecko) '
+ 'Chrome/111.0.0.0 Safari/537.36',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document'
}
});
if (!response.ok) {
console.log(`URL ${url} failed with status ${response.status}`, response);
await urlCache.set(url, false);
return false;
}
// Get meta information for valid URLs
const metaInfo = await meta(url);
await urlCache.set(url, true, metaInfo);
return true;
} catch (error) {
console.log(`Error validateUrl ${url}`, error);
await urlCache.set(url, false);
return false;
} finally {
clearTimeout(timer);
}
}
/**
* Validates if a URL is accessible with a timeout
* @param url - The URL to validate
* @param timeoutMs - Timeout in milliseconds (default: 10500)
* @returns Promise resolving to true if link is valid, false otherwise
*/
async function validateUrl_0(url: string, timeoutMs: number = 10500): Promise<boolean> {
try {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
const response = await fetch(url, {
method: 'HEAD',
signal: controller.signal,
mode: 'no-cors' // This allows checking cross-origin links
});
clearTimeout(timeoutId);
// For no-cors mode, we can't check the status, so we assume success if we get a response
if (response.type === 'opaque') {
return true;
}
// Check if status is in 2xx range
return response.ok;
} catch (error) {
// Handle various error cases
if (error instanceof Error) {
// AbortError means timeout
if (error.name === 'AbortError') {
console.warn(`Timeout checking URL: ${url}`);
return false;
}
// Network errors or other fetch errors
console.warn(`Error checking URL ${url}: ${error.message}`);
}
return false;
}
}
/**
* Validates markdown links in text and wraps invalid ones in strikethrough
* @param text - The text containing markdown links to validate
* @returns Promise resolving to text with invalid links struck through
*/
export const validateLinks = async (text: string): Promise<string> => {
const urlRegex = /\[([^\]]+)\]\(([^)]+)\)/g;
const matches = text.matchAll(urlRegex);
let processedText = text;
for (const match of matches) {
const [fullMatch, linkText, url] = match;
try {
// Check cache first
const cachedResult = await urlCache.get(url);
if (cachedResult !== null) {
if (!cachedResult.isValid) {
processedText = processedText.replace(fullMatch, `~~[${linkText}](${url})~~`);
}
continue;
}
// Encode the URL to handle special characters
const encodedUrl = encodeURI(url);
const isValid = await validateUrl(encodedUrl);
// Add strikethrough for invalid links while preserving the link
if (!isValid) {
processedText = processedText.replace(fullMatch, `~~[${linkText}](${url})~~`);
}
} catch (error) {
// If there's an error checking the link, assume it's invalid
await urlCache.set(url, false);
processedText = processedText.replace(fullMatch, `~~[${linkText}](${url})~~`);
}
}
return processedText;
};
/**
* Renders plain-text URLs as anchor tags, dropping blacklisted URLs entirely
* @param text - The text containing URLs to process
* @returns Processed text with rendered links
*/
export const renderLinks = (text: string): string =>
text.replace(/https?:\/\/[^\s<"]+/gi, (url) => {
const isBlacklisted = urlBlacklist.some((domain) =>
url.toLowerCase().includes(domain.toLowerCase())
);
if (isBlacklisted) return "";
const domain = getDomain(url);
const displayText = `${domain}: ${shortenUrl(url)}`;
return `<a class="text-orange-600 underline" href="${url}" target="_blank" rel="noopener noreferrer">${displayText}</a>`;
});
/**
* Filters out banned phrases from text
* @param text - The text to filter
* @returns Text with banned phrases replaced
*/
export const filterBannedPhrases = (text: string): string =>
bannedWords.reduce(
(acc, word) => acc.replace(new RegExp(`\\b${word}\\b`, "gi"), "[filtered]"),
text
);
/**
* Replaces specific words in text according to the wordReplaceMap
* @param text - The text to process
* @returns Text with words replaced according to the mapping
*/
export const replaceWords = (text: string): string =>
Object.entries(wordReplaceMap).reduce(
(acc, [word, replacement]) =>
acc.replace(new RegExp(`\\b${word}\\b`, "gi"), replacement),
text
);
export const default_filters_plain: FilterFunction[] = [
renderLinks,
filterBannedPhrases,
replaceWords
];
export const default_filters_markdown: FilterFunction[] = [
(text: string) => filterMarkdownLinks(text, urlBlacklist.map(url => ({ pattern: url, replacement: "" }))),
filterBannedPhrases,
replaceWords,
validateLinks
];
/**
* Applies all filters to the input text in sequence
* @param text - The text to filter
* @param filters - Array of filter functions to apply
* @returns Promise resolving to the filtered text
*/
export async function applyFilters(text: string = '', filters: FilterFunction[] = default_filters_plain): Promise<string> {
return filters.reduce(
async (promise, filterFn) => {
const currentText = await promise;
return filterFn(currentText);
},
Promise.resolve(text)
)
}
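
A minimal usage sketch of the filter pipeline defined above, assuming it is now imported from '@/base/filters.js' as the updated imports below do; the exact output depends on network and cache state during link validation:

// Minimal usage sketch; the '@/base/filters.js' path mirrors the updated imports
// elsewhere in this commit and is an assumption about the new module location.
import { applyFilters, default_filters_markdown } from '@/base/filters.js';

const raw = 'Check out [example](https://example.com) with the wizard Router';

// Filters run left to right: markdown-link blacklisting, banned-phrase filtering,
// word replacement, then asynchronous link validation.
const filtered = await applyFilters(raw, default_filters_markdown);

// Expected to look roughly like (link validation depends on network/cache):
// 'Check out [example](https://example.com) with the [filtered] CNC Router'
console.log(filtered);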

View File

@@ -12,10 +12,10 @@ import { sync as write } from '@polymech/fs/write'
import type { Loader, LoaderContext } from 'astro/loaders'
import { resolveVariables } from "@polymech/commons/variables"
export * from './howto-model.js'
export * from './filters.js'
export * from '../base/filters.js'
import { IHowto, IImage, ITag, ITEM_TYPE } from './howto-model.js'
import { blacklist, default_filters_markdown } from './filters.js'
import { blacklist, default_filters_markdown } from '../base/filters.js'
import { download } from './download.js'
import { filter } from "@/base/kbot.js"
@@ -47,7 +47,7 @@ const NB_ITEMS = 10
import { env, logger } from '@/base/index.js'
import { applyFilters, default_filters_plain, FilterFunction } from './filters.js'
import { applyFilters, default_filters_plain, FilterFunction } from '../base/filters.js'
import { TemplateContext, buildPrompt, LLMConfig, createTemplates } from '@/base/kbot-templates.js';
import { template_filter } from '@/base/kbot.js'
export const item_path = (item: IHowto) => `${HOWTO_ROOT()}/${item.slug}`

View File

@@ -16,7 +16,7 @@ import { env, logger } from '@/base/index.js'
import { download } from './download.js'
import { default_image } from 'config/config.js'
import { applyFilters, default_filters_plain, FilterFunction } from './filters.js'
import { applyFilters, default_filters_plain, FilterFunction } from '../base/filters.js'
import { TemplateContext, buildPrompt, LLMConfig, createTemplates } from '@/base/kbot-templates.js'
import { template_filter } from '@/base/kbot.js'