From f01f52ca45a2bb8fcbf9c04fb8cde3dfdf9e85a3 Mon Sep 17 00:00:00 2001 From: babayaga Date: Sat, 29 Mar 2025 11:21:11 +0100 Subject: [PATCH] base:url cleanup --- package.json | 2 +- src/base/__tests__/__mocks__/url.js | 9 - src/base/__tests__/url-cache.test.ts | 105 ----------- src/components/howtos/Detail.astro | 2 +- src/components/howtos/Detail2.astro | 2 +- src/model/filters.test.ts | 166 ---------------- src/model/filters.ts | 270 --------------------------- src/model/howto.ts | 6 +- src/model/product.ts | 2 +- 9 files changed, 7 insertions(+), 557 deletions(-) delete mode 100644 src/base/__tests__/__mocks__/url.js delete mode 100644 src/base/__tests__/url-cache.test.ts delete mode 100644 src/model/filters.test.ts delete mode 100644 src/model/filters.ts diff --git a/package.json b/package.json index 9831a8d..8ae5371 100644 --- a/package.json +++ b/package.json @@ -24,7 +24,7 @@ "test:model": "vitest run src/model", "test:model:watch": "vitest watch src/model", "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch", - "test:url": "vitest run src/base/url.test.ts src/base/link-preview.test.ts" + "test:url": "vitest run src/base/url.test.ts" }, "dependencies": { "@astrojs/compiler": "^2.10.4", diff --git a/src/base/__tests__/__mocks__/url.js b/src/base/__tests__/__mocks__/url.js deleted file mode 100644 index e0d4465..0000000 --- a/src/base/__tests__/__mocks__/url.js +++ /dev/null @@ -1,9 +0,0 @@ -import { jest } from '@jest/globals'; - -export const meta = jest.fn().mockResolvedValue({ - title: 'Test Title', - description: 'Test Description', - image: 'https://example.com/image.jpg', - favicon: 'https://example.com/favicon.ico', - siteName: 'Example Site' -}); \ No newline at end of file diff --git a/src/base/__tests__/url-cache.test.ts b/src/base/__tests__/url-cache.test.ts deleted file mode 100644 index b77a13b..0000000 --- a/src/base/__tests__/url-cache.test.ts +++ /dev/null @@ -1,105 +0,0 @@ -import { jest } from '@jest/globals'; -import { urlCache } from '../url-cache.js'; -import { validateUrl } from '../../model/filters.js'; -import fs from 'fs/promises'; -import path from 'path'; - -jest.mock('../url.js'); - -// Mock fetch -const mockFetch = jest.fn().mockImplementation( - (): Promise => - Promise.resolve(new Response(null, { - status: 200, - statusText: 'OK' - })) -); -(global as any).fetch = mockFetch; - -const testMeta = { - title: 'Test Title', - description: 'Test Description', - image: 'https://example.com/image.jpg', - favicon: 'https://example.com/favicon.ico', - siteName: 'Example Site' -}; - -describe('UrlCache', () => { - const testUrl = 'https://example.com'; - - beforeEach(async () => { - // Clear cache before each test - await urlCache.clear(); - // Reset fetch mock - mockFetch.mockClear(); - }); - - afterAll(async () => { - // Clean up after all tests - await urlCache.clear(); - }); - - test('should store and retrieve URL validity', async () => { - await urlCache.set(testUrl, true); - const result = await urlCache.get(testUrl); - expect(result).toBeTruthy(); - expect(result?.isValid).toBe(true); - }); - - test('should store and retrieve meta information', async () => { - await urlCache.set(testUrl, true, testMeta); - const result = await urlCache.get(testUrl); - expect(result?.meta).toEqual(testMeta); - }); - - test('should handle invalid URLs', async () => { - await urlCache.set(testUrl, false); - const result = await urlCache.get(testUrl); - expect(result?.isValid).toBe(false); - }); - - test('should expire cache entries', async () => { - // Set a URL with a very old timestamp - const oldEntry = { - isValid: true, - timestamp: Date.now() - (8 * 24 * 60 * 60 * 1000), // 8 days old - meta: testMeta - }; - - const cacheFile = path.join(process.cwd(), '.cache', 'url-cache.json'); - await fs.writeFile(cacheFile, JSON.stringify({ [testUrl]: oldEntry })); - - const result = await urlCache.get(testUrl); - expect(result).toBeNull(); - }); - - test('validateUrl should store meta information', async () => { - const isValid = await validateUrl(testUrl); - expect(isValid).toBe(true); - expect(mockFetch).toHaveBeenCalledWith( - testUrl, - expect.objectContaining({ - signal: expect.any(AbortSignal), - redirect: 'follow' - }) - ); - - const result = await urlCache.get(testUrl); - expect(result?.isValid).toBe(true); - expect(result?.meta).toEqual(testMeta); - }); - - test('expandUrls should add meta information to valid URLs without meta', async () => { - // Add a URL without meta info - await urlCache.set(testUrl, true); - let result = await urlCache.get(testUrl); - expect(result?.meta).toBeUndefined(); - - // Expand URLs - await urlCache.expandUrls(); - - // Check that meta info was added - result = await urlCache.get(testUrl); - expect(result?.meta).toEqual(testMeta); - }); -}); \ No newline at end of file diff --git a/src/components/howtos/Detail.astro b/src/components/howtos/Detail.astro index 84f5d9e..e942cd6 100644 --- a/src/components/howtos/Detail.astro +++ b/src/components/howtos/Detail.astro @@ -14,7 +14,7 @@ import { sync as exists } from "@polymech/fs/exists"; import { sync as read } from "@polymech/fs/read"; import { createMarkdownComponent } from "@/base/index.js"; import { translate } from "@/base/i18n.js"; -import { applyFilters, shortenUrl } from "@/model/howto.js"; +import { applyFilters, shortenUrl } from "@/base/filters.js"; import { HOWTO_FILES_WEB, HOWTO_FILES_ABS, diff --git a/src/components/howtos/Detail2.astro b/src/components/howtos/Detail2.astro index 7db93f4..7e27622 100644 --- a/src/components/howtos/Detail2.astro +++ b/src/components/howtos/Detail2.astro @@ -14,7 +14,7 @@ import { sync as exists } from "@polymech/fs/exists"; import { sync as read } from "@polymech/fs/read"; import { createHTMLComponent, createMarkdownComponent } from "@/base/index.js"; import { translate } from "@/base/i18n.js"; -import { applyFilters, shortenUrl } from "@/model/howto.js"; +import { applyFilters, shortenUrl } from "@/base/filters.js"; // import { extract, extract_learned_skills, references } from "@/base/kbot.js"; import { HOWTO_FILES_WEB, diff --git a/src/model/filters.test.ts b/src/model/filters.test.ts deleted file mode 100644 index b6ba5e0..0000000 --- a/src/model/filters.test.ts +++ /dev/null @@ -1,166 +0,0 @@ -import './test-setup.js'; -import { describe, it, expect } from 'vitest'; -import { - shortenUrl, - renderLinks, - filterBannedPhrases, - replaceWords, - applyFilters, - default_filters_plain, - default_filters_markdown, - item_path, - validateLinks -} from './filters.js'; - -describe('filters', () => { - describe('item_path', () => { - it('should generate correct path from item', () => { - const item = { data: { slug: 'test-slug' } }; - expect(item_path(item)).toBe('/howto/test-slug'); - }); - }); - - describe('shortenUrl', () => { - it('should remove www. prefix and trailing slashes', () => { - expect(shortenUrl('https://www.example.com/path/')).toBe('example.com/path'); - }); - - it('should handle URLs without www. prefix', () => { - expect(shortenUrl('https://example.com/path')).toBe('example.com/path'); - }); - - it('should handle invalid URLs gracefully', () => { - expect(shortenUrl('invalid-url')).toBe('invalid-url'); - }); - - it('should handle URLs with query parameters', () => { - expect(shortenUrl('https://example.com/path?param=value')).toBe('example.com/path?param=value'); - }); - }); - - describe('renderLinks', () => { - it('should render non-blacklisted links', () => { - const input = 'Check out https://example.com'; - const expected = 'Check out example.com'; - expect(renderLinks(input)).toBe(expected); - }); - - it('should replace blacklisted links with empty string', () => { - const input = 'Check out https://preciousplastic.com'; - expect(renderLinks(input)).toBe('Check out '); - }); - - it('should handle multiple links in text', () => { - const input = 'Check out https://example.com and https://preciousplastic.com'; - const result = renderLinks(input); - expect(result).toContain('example.com'); - expect(result).toContain('and '); - }); - }); - - describe('filterBannedPhrases', () => { - it('should replace banned words with [filtered]', () => { - const input = 'The wizard used magic2'; - const expected = 'The [filtered] used [filtered]'; - expect(filterBannedPhrases(input)).toBe(expected); - }); - - it('should handle case-insensitive matching', () => { - const input = 'The WIZARD used MAGIC2'; - const expected = 'The [filtered] used [filtered]'; - expect(filterBannedPhrases(input)).toBe(expected); - }); - - it('should not replace partial matches', () => { - const input = 'The wizardry used magic2.0'; - const expected = 'The wizardry used [filtered].0'; - expect(filterBannedPhrases(input)).toBe(expected); - }); - }); - - describe('replaceWords', () => { - it('should replace words according to wordReplaceMap', () => { - const input = 'I need a Router for my Car'; - const expected = 'I need a CNC Router for my tufftuff'; - expect(replaceWords(input)).toBe(expected); - }); - - it('should handle multi-word replacements', () => { - const input = 'I need a laptop stand'; - expect(replaceWords(input)).toBe('I need a laptoppie'); - }); - - it('should handle case-insensitive matching', () => { - const input = 'I need a ROUTER for my CAR'; - const expected = 'I need a CNC Router for my tufftuff'; - expect(replaceWords(input)).toBe(expected); - }); - }); - - describe('applyFilters', () => { - it('should apply plain text filters in sequence', async () => { - const input = 'Check out https://example.com with the wizard Router'; - const result = await applyFilters(input, default_filters_plain); - expect(result).toContain('example.com'); - expect(result).toContain('[filtered]'); - expect(result).toContain('CNC Router'); - }); - - it('should apply markdown filters in sequence', async () => { - const input = 'Check out [example](https://example.com) with the wizard Router'; - const result = await applyFilters(input, default_filters_markdown); - expect(result).toContain('example'); - expect(result).toContain('[filtered]'); - expect(result).toContain('CNC Router'); - }); - - it('should handle empty input', async () => { - expect(await applyFilters('')).toBe(''); - }); - - it('should handle custom filter array', async () => { - const customFilters = [filterBannedPhrases]; - const input = 'The wizard used magic2'; - const expected = 'The [filtered] used [filtered]'; - expect(await applyFilters(input, customFilters)).toBe(expected); - }); - - it('should handle markdown links with blacklisted URLs', async () => { - const input = 'Check out [example](https://preciousplastic.com)'; - const result = await applyFilters(input, default_filters_markdown); - expect(result).toBe('Check out example'); - }); - }); - - describe('validateLinks', () => { - it('should remove invalid links entirely', async () => { - const input = 'Check out [example](https://invalid-url-that-does-not-exist.com)'; - const result = await validateLinks(input); - expect(result).toBe('Check out example'); - }); - - it('should preserve valid links', async () => { - const input = 'Check out [example](https://example.com)'; - const result = await validateLinks(input); - expect(result).toBe('Check out [example](https://example.com)'); - }); - - it('should handle multiple links in text', async () => { - const input = 'Check out [valid](https://example.com) and [invalid](https://invalid-url-that-does-not-exist.com)'; - const result = await validateLinks(input); - expect(result).toBe('Check out [valid](https://example.com) and invalid'); - }); - - it('should handle links with special characters', async () => { - const input = '[special](https://example.com/path?param=value#fragment)'; - const result = await validateLinks(input); - expect(result).toBe('[special](https://example.com/path?param=value#fragment)'); - }); - - it('should handle links with special characters that are invalid', async () => { - const input = '[special](https://invalid-url-that-does-not-exist.com/path?param=value#fragment)'; - const result = await validateLinks(input); - expect(result).toBe('special'); - }); - }); -}); \ No newline at end of file diff --git a/src/model/filters.ts b/src/model/filters.ts deleted file mode 100644 index dc616a9..0000000 --- a/src/model/filters.ts +++ /dev/null @@ -1,270 +0,0 @@ -process.env['NODE_TLS_REJECT_UNAUTHORIZED'] = '0'; - -export * from './howto-model.js' -import { HOWTO_ROOT } from "config/config.js"; -import { filterMarkdownLinks } from "../base/markdown.js"; -import { urlCache } from '../base/url-cache.js'; -import { meta } from '../base/url.js'; - -interface Item { - data: { - slug: string; - }; -} -export interface FilterFunction { (text: string): string | Promise } -export const blacklist: readonly string[] = [ - 'precious-plastic', - 'fair-enough', - 'mad-plastic-labs', - 'easymoulds', - 'plasticpreneur', - 'sustainable-design-studio', - 'johannplasto' -] as const; - -export const urlBlacklist: readonly string[] = [ - "thenounproject.com", - "preciousplastic.com", - "community.preciousplastic.com", - "bazar.preciousplastic.com", - "onearmy.earth" -] as const; - -export const bannedWords: readonly string[] = [ - "wizard", - "magic2", - "precious plastic", - "onearmy" -] as const; - -export const wordReplaceMap: Readonly> = { - Router: "CNC Router", - "laptop stand": "laptoppie", - Car: "tufftuff" -} as const; - -export const item_path = (item: Item): string => `${HOWTO_ROOT()}/${item.data.slug}`; -/** - * Shortens a URL by removing 'www.' prefix and trailing slashes - * @param url - The URL to shorten - * @returns The shortened URL or the original URL if invalid - */ -export const shortenUrl = (url: string): string => { - try { - const { hostname, pathname, search } = new URL(url); - const cleanHost = hostname.replace(/^www\./, ''); - const cleanPath = pathname.replace(/\/$/, ''); - return `${cleanHost}${decodeURIComponent(cleanPath)}${search}`; - } catch (error) { - console.warn(`Invalid URL provided to shortenUrl: ${url}`); - return url; - } -}; - -/** - * Gets the domain name from a URL - * @param url - The URL to extract domain from - * @returns The domain name or empty string if invalid - */ -export const getDomain = (url: string): string => { - try { - const { hostname } = new URL(url); - return hostname.replace(/^www\./, ''); - } catch { - return ''; - } -}; - - -export async function validateUrl( - url: string, - timeout: number = 22500 -): Promise { - const controller = new AbortController(); - const timer = setTimeout(() => controller.abort(), timeout); - try { - const response = await fetch(url, { - signal: controller.signal, - redirect: 'follow', - headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' - + 'AppleWebKit/537.36 (KHTML, like Gecko) ' - + 'Chrome/111.0.0.0 Safari/537.36', - 'Accept-Language': 'en-US,en;q=0.9', - 'Accept-Encoding': 'gzip, deflate, br', - 'Connection': 'keep-alive', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-User': '?1', - 'Sec-Fetch-Dest': 'document' - } - }); - - if (!response.ok || response.status === 404) { - console.log(`URL ${url} is 404`, response); - await urlCache.set(url, false); - return false; - } - - // Get meta information for valid URLs - const metaInfo = await meta(url); - await urlCache.set(url, true, metaInfo); - return true; - } catch (error) { - console.log(`Error validateUrl ${url}`, error); - await urlCache.set(url, false); - return false; - } finally { - clearTimeout(timer); - } -} -/** - * Validates if a URL is accessible with a timeout - * @param url - The URL to validate - * @param timeoutMs - Timeout in milliseconds (default: 3500) - * @returns Promise resolving to true if link is valid, false otherwise - */ -async function validateUrl_0(url: string, timeoutMs: number = 10500): Promise { - try { - const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), timeoutMs); - - const response = await fetch(url, { - method: 'HEAD', - signal: controller.signal, - mode: 'no-cors' // This allows checking cross-origin links - }); - - clearTimeout(timeoutId); - - // For no-cors mode, we can't check the status, so we assume success if we get a response - if (response.type === 'opaque') { - return true; - } - - // Check if status is in 2xx range - return response.ok; - } catch (error) { - // Handle various error cases - if (error instanceof Error) { - // AbortError means timeout - if (error.name === 'AbortError') { - console.warn(`Timeout checking URL: ${url}`); - return false; - } - // Network errors or other fetch errors - console.warn(`Error checking URL ${url}: ${error.message}`); - } - return false; - } -} - -/** - * Validates links in text and removes invalid ones - * @param text - The text containing links to validate - * @returns Promise resolving to text with invalid links removed - */ -export const validateLinks = async (text: string): Promise => { - const urlRegex = /\[([^\]]+)\]\(([^)]+)\)/g; - const matches = text.matchAll(urlRegex); - let processedText = text; - - for (const match of matches) { - const [fullMatch, linkText, url] = match; - try { - // Check cache first - const cachedResult = await urlCache.get(url); - if (cachedResult !== null) { - if (!cachedResult.isValid) { - processedText = processedText.replace(fullMatch, `~~[${linkText}](${url})~~`); - } - continue; - } - - // Encode the URL to handle special characters - const encodedUrl = encodeURI(url); - const isValid = await validateUrl(encodedUrl); - - // Add strikethrough for invalid links while preserving the link - if (!isValid) { - processedText = processedText.replace(fullMatch, `~~[${linkText}](${url})~~`); - } - } catch (error) { - // If there's an error checking the link, assume it's invalid - await urlCache.set(url, false); - processedText = processedText.replace(fullMatch, `~~[${linkText}](${url})~~`); - } - } - - return processedText; -}; - -/** - * Renders links in text, replacing blacklisted URLs with "[Link Removed]" - * @param text - The text containing URLs to process - * @returns Processed text with rendered links - */ -export const renderLinks = (text: string): string => - text.replace(/https?:\/\/[^\s<"]+/gi, (url) => { - const isBlacklisted = urlBlacklist.some((domain) => - url.toLowerCase().includes(domain.toLowerCase()) - ); - if (isBlacklisted) return ""; - - const domain = getDomain(url); - const displayText = `${domain}: ${shortenUrl(url)}`; - return `${displayText}`; - }); - -/** - * Filters out banned phrases from text - * @param text - The text to filter - * @returns Text with banned phrases replaced - */ -export const filterBannedPhrases = (text: string): string => - bannedWords.reduce( - (acc, word) => acc.replace(new RegExp(`\\b${word}\\b`, "gi"), "[filtered]"), - text - ); - -/** - * Replaces specific words in text according to the wordReplaceMap - * @param text - The text to process - * @returns Text with words replaced according to the mapping - */ -export const replaceWords = (text: string): string => - Object.entries(wordReplaceMap).reduce( - (acc, [word, replacement]) => - acc.replace(new RegExp(`\\b${word}\\b`, "gi"), replacement), - text - ); - -export const default_filters_plain: FilterFunction[] = [ - renderLinks, - filterBannedPhrases, - replaceWords -] as const; - -export const default_filters_markdown: FilterFunction[] = [ - (text: string) => filterMarkdownLinks(text, urlBlacklist.map(url => ({ pattern: url, replacement: "" }))), - filterBannedPhrases, - replaceWords, - validateLinks -] as const; - -/** - * Applies all filters to the input text in sequence - * @param text - The text to filter - * @param filters - Array of filter functions to apply - * @returns Promise resolving to the filtered text - */ -export async function applyFilters(text: string = '', filters: FilterFunction[] = default_filters_plain): Promise { - return filters.reduce( - async (promise, filterFn) => { - const currentText = await promise; - return filterFn(currentText); - }, - Promise.resolve(text) - ) -} - diff --git a/src/model/howto.ts b/src/model/howto.ts index 794e156..9d90eda 100644 --- a/src/model/howto.ts +++ b/src/model/howto.ts @@ -12,10 +12,10 @@ import { sync as write } from '@polymech/fs/write' import type { Loader, LoaderContext } from 'astro/loaders' import { resolveVariables } from "@polymech/commons/variables" export * from './howto-model.js' -export * from './filters.js' +export * from '../base/filters.js' import { IHowto, IImage, ITag, ITEM_TYPE } from './howto-model.js' -import { blacklist, default_filters_markdown } from './filters.js' +import { blacklist, default_filters_markdown } from '../base/filters.js' import { download } from './download.js' import { filter } from "@/base/kbot.js" @@ -47,7 +47,7 @@ const NB_ITEMS = 10 import { env, logger } from '@/base/index.js' -import { applyFilters, default_filters_plain, FilterFunction } from './filters.js' +import { applyFilters, default_filters_plain, FilterFunction } from '../base/filters.js' import { TemplateContext, buildPrompt, LLMConfig, createTemplates } from '@/base/kbot-templates.js'; import { template_filter } from '@/base/kbot.js' export const item_path = (item: IHowto) => `${HOWTO_ROOT()}/${item.slug}` diff --git a/src/model/product.ts b/src/model/product.ts index f726c27..3c5da89 100644 --- a/src/model/product.ts +++ b/src/model/product.ts @@ -16,7 +16,7 @@ import { env, logger } from '@/base/index.js' import { download } from './download.js' import { default_image } from 'config/config.js' -import { applyFilters, default_filters_plain, FilterFunction } from './filters.js' +import { applyFilters, default_filters_plain, FilterFunction } from '../base/filters.js' import { TemplateContext, buildPrompt, LLMConfig, createTemplates } from '@/base/kbot-templates.js' import { template_filter } from '@/base/kbot.js'