base:url cleanup

lovebird 2025-03-29 11:21:11 +01:00
parent ed0d8b60f2
commit f01f52ca45
9 changed files with 7 additions and 557 deletions

View File

@@ -24,7 +24,7 @@
"test:model": "vitest run src/model",
"test:model:watch": "vitest watch src/model",
"test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch",
"test:url": "vitest run src/base/url.test.ts src/base/link-preview.test.ts"
"test:url": "vitest run src/base/url.test.ts"
},
"dependencies": {
"@astrojs/compiler": "^2.10.4",

View File

@@ -1,9 +0,0 @@
import { jest } from '@jest/globals';
export const meta = jest.fn().mockResolvedValue({
title: 'Test Title',
description: 'Test Description',
image: 'https://example.com/image.jpg',
favicon: 'https://example.com/favicon.ico',
siteName: 'Example Site'
});

View File

@@ -1,105 +0,0 @@
import { jest } from '@jest/globals';
import { urlCache } from '../url-cache.js';
import { validateUrl } from '../../model/filters.js';
import fs from 'fs/promises';
import path from 'path';
jest.mock('../url.js');
// Mock fetch
const mockFetch = jest.fn().mockImplementation(
(): Promise<Response> =>
Promise.resolve(new Response(null, {
status: 200,
statusText: 'OK'
}))
);
(global as any).fetch = mockFetch;
const testMeta = {
title: 'Test Title',
description: 'Test Description',
image: 'https://example.com/image.jpg',
favicon: 'https://example.com/favicon.ico',
siteName: 'Example Site'
};
describe('UrlCache', () => {
const testUrl = 'https://example.com';
beforeEach(async () => {
// Clear cache before each test
await urlCache.clear();
// Reset fetch mock
mockFetch.mockClear();
});
afterAll(async () => {
// Clean up after all tests
await urlCache.clear();
});
test('should store and retrieve URL validity', async () => {
await urlCache.set(testUrl, true);
const result = await urlCache.get(testUrl);
expect(result).toBeTruthy();
expect(result?.isValid).toBe(true);
});
test('should store and retrieve meta information', async () => {
await urlCache.set(testUrl, true, testMeta);
const result = await urlCache.get(testUrl);
expect(result?.meta).toEqual(testMeta);
});
test('should handle invalid URLs', async () => {
await urlCache.set(testUrl, false);
const result = await urlCache.get(testUrl);
expect(result?.isValid).toBe(false);
});
test('should expire cache entries', async () => {
// Set a URL with a very old timestamp
const oldEntry = {
isValid: true,
timestamp: Date.now() - (8 * 24 * 60 * 60 * 1000), // 8 days old
meta: testMeta
};
const cacheFile = path.join(process.cwd(), '.cache', 'url-cache.json');
await fs.writeFile(cacheFile, JSON.stringify({ [testUrl]: oldEntry }));
const result = await urlCache.get(testUrl);
expect(result).toBeNull();
});
test('validateUrl should store meta information', async () => {
const isValid = await validateUrl(testUrl);
expect(isValid).toBe(true);
expect(mockFetch).toHaveBeenCalledWith(
testUrl,
expect.objectContaining({
signal: expect.any(AbortSignal),
redirect: 'follow'
})
);
const result = await urlCache.get(testUrl);
expect(result?.isValid).toBe(true);
expect(result?.meta).toEqual(testMeta);
});
test('expandUrls should add meta information to valid URLs without meta', async () => {
// Add a URL without meta info
await urlCache.set(testUrl, true);
let result = await urlCache.get(testUrl);
expect(result?.meta).toBeUndefined();
// Expand URLs
await urlCache.expandUrls();
// Check that meta info was added
result = await urlCache.get(testUrl);
expect(result?.meta).toEqual(testMeta);
});
});
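
For reference, the cache surface this deleted test exercised can be sketched from the calls above; the entry shape, method names, and the roughly one-week expiry are inferred from the test's assertions, not confirmed elsewhere in this diff:

// Hedged sketch of the UrlCache API used by the test above; the CacheEntry shape
// and the ~7-day expiry are inferred from the assertions, not from url-cache.ts itself.
interface UrlMeta {
  title: string;
  description: string;
  image: string;
  favicon: string;
  siteName: string;
}

interface CacheEntry {
  isValid: boolean;
  timestamp: number;   // ms epoch; entries older than roughly 7 days are treated as expired
  meta?: UrlMeta;
}

interface UrlCache {
  get(url: string): Promise<CacheEntry | null>;   // null when missing or expired
  set(url: string, isValid: boolean, meta?: UrlMeta): Promise<void>;
  clear(): Promise<void>;
  expandUrls(): Promise<void>;                    // backfills meta for valid entries that lack it
}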

View File

@@ -14,7 +14,7 @@ import { sync as exists } from "@polymech/fs/exists";
import { sync as read } from "@polymech/fs/read";
import { createMarkdownComponent } from "@/base/index.js";
import { translate } from "@/base/i18n.js";
import { applyFilters, shortenUrl } from "@/model/howto.js";
import { applyFilters, shortenUrl } from "@/base/filters.js";
import {
HOWTO_FILES_WEB,
HOWTO_FILES_ABS,

View File

@@ -14,7 +14,7 @@ import { sync as exists } from "@polymech/fs/exists";
import { sync as read } from "@polymech/fs/read";
import { createHTMLComponent, createMarkdownComponent } from "@/base/index.js";
import { translate } from "@/base/i18n.js";
import { applyFilters, shortenUrl } from "@/model/howto.js";
import { applyFilters, shortenUrl } from "@/base/filters.js";
// import { extract, extract_learned_skills, references } from "@/base/kbot.js";
import {
HOWTO_FILES_WEB,

View File

@@ -1,166 +0,0 @@
import './test-setup.js';
import { describe, it, expect } from 'vitest';
import {
shortenUrl,
renderLinks,
filterBannedPhrases,
replaceWords,
applyFilters,
default_filters_plain,
default_filters_markdown,
item_path,
validateLinks
} from './filters.js';
describe('filters', () => {
describe('item_path', () => {
it('should generate correct path from item', () => {
const item = { data: { slug: 'test-slug' } };
expect(item_path(item)).toBe('/howto/test-slug');
});
});
describe('shortenUrl', () => {
it('should remove www. prefix and trailing slashes', () => {
expect(shortenUrl('https://www.example.com/path/')).toBe('example.com/path');
});
it('should handle URLs without www. prefix', () => {
expect(shortenUrl('https://example.com/path')).toBe('example.com/path');
});
it('should handle invalid URLs gracefully', () => {
expect(shortenUrl('invalid-url')).toBe('invalid-url');
});
it('should handle URLs with query parameters', () => {
expect(shortenUrl('https://example.com/path?param=value')).toBe('example.com/path?param=value');
});
});
describe('renderLinks', () => {
it('should render non-blacklisted links', () => {
const input = 'Check out https://example.com';
const expected = 'Check out <a class="text-orange-600 underline" href="https://example.com" target="_blank" rel="noopener noreferrer">example.com</a>';
expect(renderLinks(input)).toBe(expected);
});
it('should replace blacklisted links with empty string', () => {
const input = 'Check out https://preciousplastic.com';
expect(renderLinks(input)).toBe('Check out ');
});
it('should handle multiple links in text', () => {
const input = 'Check out https://example.com and https://preciousplastic.com';
const result = renderLinks(input);
expect(result).toContain('example.com');
expect(result).toContain('and ');
});
});
describe('filterBannedPhrases', () => {
it('should replace banned words with [filtered]', () => {
const input = 'The wizard used magic2';
const expected = 'The [filtered] used [filtered]';
expect(filterBannedPhrases(input)).toBe(expected);
});
it('should handle case-insensitive matching', () => {
const input = 'The WIZARD used MAGIC2';
const expected = 'The [filtered] used [filtered]';
expect(filterBannedPhrases(input)).toBe(expected);
});
it('should not replace partial matches', () => {
const input = 'The wizardry used magic2.0';
const expected = 'The wizardry used [filtered].0';
expect(filterBannedPhrases(input)).toBe(expected);
});
});
describe('replaceWords', () => {
it('should replace words according to wordReplaceMap', () => {
const input = 'I need a Router for my Car';
const expected = 'I need a CNC Router for my tufftuff';
expect(replaceWords(input)).toBe(expected);
});
it('should handle multi-word replacements', () => {
const input = 'I need a laptop stand';
expect(replaceWords(input)).toBe('I need a laptoppie');
});
it('should handle case-insensitive matching', () => {
const input = 'I need a ROUTER for my CAR';
const expected = 'I need a CNC Router for my tufftuff';
expect(replaceWords(input)).toBe(expected);
});
});
describe('applyFilters', () => {
it('should apply plain text filters in sequence', async () => {
const input = 'Check out https://example.com with the wizard Router';
const result = await applyFilters(input, default_filters_plain);
expect(result).toContain('example.com');
expect(result).toContain('[filtered]');
expect(result).toContain('CNC Router');
});
it('should apply markdown filters in sequence', async () => {
const input = 'Check out [example](https://example.com) with the wizard Router';
const result = await applyFilters(input, default_filters_markdown);
expect(result).toContain('example');
expect(result).toContain('[filtered]');
expect(result).toContain('CNC Router');
});
it('should handle empty input', async () => {
expect(await applyFilters('')).toBe('');
});
it('should handle custom filter array', async () => {
const customFilters = [filterBannedPhrases];
const input = 'The wizard used magic2';
const expected = 'The [filtered] used [filtered]';
expect(await applyFilters(input, customFilters)).toBe(expected);
});
it('should handle markdown links with blacklisted URLs', async () => {
const input = 'Check out [example](https://preciousplastic.com)';
const result = await applyFilters(input, default_filters_markdown);
expect(result).toBe('Check out example');
});
});
describe('validateLinks', () => {
it('should remove invalid links entirely', async () => {
const input = 'Check out [example](https://invalid-url-that-does-not-exist.com)';
const result = await validateLinks(input);
expect(result).toBe('Check out example');
});
it('should preserve valid links', async () => {
const input = 'Check out [example](https://example.com)';
const result = await validateLinks(input);
expect(result).toBe('Check out [example](https://example.com)');
});
it('should handle multiple links in text', async () => {
const input = 'Check out [valid](https://example.com) and [invalid](https://invalid-url-that-does-not-exist.com)';
const result = await validateLinks(input);
expect(result).toBe('Check out [valid](https://example.com) and invalid');
});
it('should handle links with special characters', async () => {
const input = '[special](https://example.com/path?param=value#fragment)';
const result = await validateLinks(input);
expect(result).toBe('[special](https://example.com/path?param=value#fragment)');
});
it('should handle links with special characters that are invalid', async () => {
const input = '[special](https://invalid-url-that-does-not-exist.com/path?param=value#fragment)';
const result = await validateLinks(input);
expect(result).toBe('special');
});
});
});

View File

@@ -1,270 +0,0 @@
process.env['NODE_TLS_REJECT_UNAUTHORIZED'] = '0';
export * from './howto-model.js'
import { HOWTO_ROOT } from "config/config.js";
import { filterMarkdownLinks } from "../base/markdown.js";
import { urlCache } from '../base/url-cache.js';
import { meta } from '../base/url.js';
interface Item {
data: {
slug: string;
};
}
export interface FilterFunction { (text: string): string | Promise<string> }
export const blacklist: readonly string[] = [
'precious-plastic',
'fair-enough',
'mad-plastic-labs',
'easymoulds',
'plasticpreneur',
'sustainable-design-studio',
'johannplasto'
] as const;
export const urlBlacklist: readonly string[] = [
"thenounproject.com",
"preciousplastic.com",
"community.preciousplastic.com",
"bazar.preciousplastic.com",
"onearmy.earth"
] as const;
export const bannedWords: readonly string[] = [
"wizard",
"magic2",
"precious plastic",
"onearmy"
] as const;
export const wordReplaceMap: Readonly<Record<string, string>> = {
Router: "CNC Router",
"laptop stand": "laptoppie",
Car: "tufftuff"
} as const;
export const item_path = (item: Item): string => `${HOWTO_ROOT()}/${item.data.slug}`;
/**
* Shortens a URL by removing 'www.' prefix and trailing slashes
* @param url - The URL to shorten
* @returns The shortened URL or the original URL if invalid
*/
export const shortenUrl = (url: string): string => {
try {
const { hostname, pathname, search } = new URL(url);
const cleanHost = hostname.replace(/^www\./, '');
const cleanPath = pathname.replace(/\/$/, '');
return `${cleanHost}${decodeURIComponent(cleanPath)}${search}`;
} catch (error) {
console.warn(`Invalid URL provided to shortenUrl: ${url}`);
return url;
}
};
/**
* Gets the domain name from a URL
* @param url - The URL to extract domain from
* @returns The domain name or empty string if invalid
*/
export const getDomain = (url: string): string => {
try {
const { hostname } = new URL(url);
return hostname.replace(/^www\./, '');
} catch {
return '';
}
};
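/**
 * Validates that a URL is reachable, caching the result along with fetched meta information
 * @param url - The URL to validate
 * @param timeout - Timeout in milliseconds (default: 22500)
 * @returns Promise resolving to true if the URL responds successfully, false otherwise
 */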
export async function validateUrl(
url: string,
timeout: number = 22500
): Promise<boolean> {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), timeout);
try {
const response = await fetch(url, {
signal: controller.signal,
redirect: 'follow',
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+ 'AppleWebKit/537.36 (KHTML, like Gecko) '
+ 'Chrome/111.0.0.0 Safari/537.36',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document'
}
});
if (!response.ok) {
console.log(`URL ${url} failed with status ${response.status}`, response);
await urlCache.set(url, false);
return false;
}
// Get meta information for valid URLs
const metaInfo = await meta(url);
await urlCache.set(url, true, metaInfo);
return true;
} catch (error) {
console.log(`Error validateUrl ${url}`, error);
await urlCache.set(url, false);
return false;
} finally {
clearTimeout(timer);
}
}
/**
* Validates if a URL is accessible with a timeout
* @param url - The URL to validate
* @param timeoutMs - Timeout in milliseconds (default: 10500)
* @returns Promise resolving to true if link is valid, false otherwise
*/
async function validateUrl_0(url: string, timeoutMs: number = 10500): Promise<boolean> {
try {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
const response = await fetch(url, {
method: 'HEAD',
signal: controller.signal,
mode: 'no-cors' // This allows checking cross-origin links
});
clearTimeout(timeoutId);
// For no-cors mode, we can't check the status, so we assume success if we get a response
if (response.type === 'opaque') {
return true;
}
// Check if status is in 2xx range
return response.ok;
} catch (error) {
// Handle various error cases
if (error instanceof Error) {
// AbortError means timeout
if (error.name === 'AbortError') {
console.warn(`Timeout checking URL: ${url}`);
return false;
}
// Network errors or other fetch errors
console.warn(`Error checking URL ${url}: ${error.message}`);
}
return false;
}
}
/**
* Validates markdown links in text and wraps invalid ones in strikethrough
* @param text - The text containing markdown links to validate
* @returns Promise resolving to text with invalid links struck through
*/
export const validateLinks = async (text: string): Promise<string> => {
const urlRegex = /\[([^\]]+)\]\(([^)]+)\)/g;
const matches = text.matchAll(urlRegex);
let processedText = text;
for (const match of matches) {
const [fullMatch, linkText, url] = match;
try {
// Check cache first
const cachedResult = await urlCache.get(url);
if (cachedResult !== null) {
if (!cachedResult.isValid) {
processedText = processedText.replace(fullMatch, `~~[${linkText}](${url})~~`);
}
continue;
}
// Encode the URL to handle special characters
const encodedUrl = encodeURI(url);
const isValid = await validateUrl(encodedUrl);
// Add strikethrough for invalid links while preserving the link
if (!isValid) {
processedText = processedText.replace(fullMatch, `~~[${linkText}](${url})~~`);
}
} catch (error) {
// If there's an error checking the link, assume it's invalid
await urlCache.set(url, false);
processedText = processedText.replace(fullMatch, `~~[${linkText}](${url})~~`);
}
}
return processedText;
};
/**
* Renders plain-text URLs as anchor tags, dropping blacklisted URLs entirely
* @param text - The text containing URLs to process
* @returns Processed text with rendered links
*/
export const renderLinks = (text: string): string =>
text.replace(/https?:\/\/[^\s<"]+/gi, (url) => {
const isBlacklisted = urlBlacklist.some((domain) =>
url.toLowerCase().includes(domain.toLowerCase())
);
if (isBlacklisted) return "";
const domain = getDomain(url);
const displayText = `${domain}: ${shortenUrl(url)}`;
return `<a class="text-orange-600 underline" href="${url}" target="_blank" rel="noopener noreferrer">${displayText}</a>`;
});
/**
* Filters out banned phrases from text
* @param text - The text to filter
* @returns Text with banned phrases replaced
*/
export const filterBannedPhrases = (text: string): string =>
bannedWords.reduce(
(acc, word) => acc.replace(new RegExp(`\\b${word}\\b`, "gi"), "[filtered]"),
text
);
/**
* Replaces specific words in text according to the wordReplaceMap
* @param text - The text to process
* @returns Text with words replaced according to the mapping
*/
export const replaceWords = (text: string): string =>
Object.entries(wordReplaceMap).reduce(
(acc, [word, replacement]) =>
acc.replace(new RegExp(`\\b${word}\\b`, "gi"), replacement),
text
);
export const default_filters_plain: FilterFunction[] = [
renderLinks,
filterBannedPhrases,
replaceWords
];
export const default_filters_markdown: FilterFunction[] = [
(text: string) => filterMarkdownLinks(text, urlBlacklist.map(url => ({ pattern: url, replacement: "" }))),
filterBannedPhrases,
replaceWords,
validateLinks
];
/**
* Applies all filters to the input text in sequence
* @param text - The text to filter
* @param filters - Array of filter functions to apply
* @returns Promise resolving to the filtered text
*/
export async function applyFilters(text: string = '', filters: FilterFunction[] = default_filters_plain): Promise<string> {
return filters.reduce(
async (promise, filterFn) => {
const currentText = await promise;
return filterFn(currentText);
},
Promise.resolve(text)
)
}
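
A minimal usage sketch of the filter pipeline defined above, assuming it is now imported from '@/base/filters.js' as the updated imports below do; the exact output depends on network and cache state during link validation:

// Minimal usage sketch; the '@/base/filters.js' path mirrors the updated imports
// elsewhere in this commit and is an assumption about the new module location.
import { applyFilters, default_filters_markdown } from '@/base/filters.js';

const raw = 'Check out [example](https://example.com) with the wizard Router';

// Filters run left to right: markdown-link blacklisting, banned-phrase filtering,
// word replacement, then asynchronous link validation.
const filtered = await applyFilters(raw, default_filters_markdown);

// Expected to look roughly like (link validation depends on network/cache):
// 'Check out [example](https://example.com) with the [filtered] CNC Router'
console.log(filtered);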

View File

@@ -12,10 +12,10 @@ import { sync as write } from '@polymech/fs/write'
import type { Loader, LoaderContext } from 'astro/loaders'
import { resolveVariables } from "@polymech/commons/variables"
export * from './howto-model.js'
export * from './filters.js'
export * from '../base/filters.js'
import { IHowto, IImage, ITag, ITEM_TYPE } from './howto-model.js'
import { blacklist, default_filters_markdown } from './filters.js'
import { blacklist, default_filters_markdown } from '../base/filters.js'
import { download } from './download.js'
import { filter } from "@/base/kbot.js"
@@ -47,7 +47,7 @@ const NB_ITEMS = 10
import { env, logger } from '@/base/index.js'
import { applyFilters, default_filters_plain, FilterFunction } from './filters.js'
import { applyFilters, default_filters_plain, FilterFunction } from '../base/filters.js'
import { TemplateContext, buildPrompt, LLMConfig, createTemplates } from '@/base/kbot-templates.js';
import { template_filter } from '@/base/kbot.js'
export const item_path = (item: IHowto) => `${HOWTO_ROOT()}/${item.slug}`

View File

@@ -16,7 +16,7 @@ import { env, logger } from '@/base/index.js'
import { download } from './download.js'
import { default_image } from 'config/config.js'
import { applyFilters, default_filters_plain, FilterFunction } from './filters.js'
import { applyFilters, default_filters_plain, FilterFunction } from '../base/filters.js'
import { TemplateContext, buildPrompt, LLMConfig, createTemplates } from '@/base/kbot-templates.js'
import { template_filter } from '@/base/kbot.js'