pm server - shared - server products 1/3
This commit is contained in:
parent dff0a8a2d6
commit f6592bb292
275 packages/shared/src/server/AbstractProduct.ts Normal file
@@ -0,0 +1,275 @@
import EventEmitter from 'events';
import { PgBoss } from 'pg-boss';
import { createHash } from 'crypto';
import { streamSSE } from 'hono/streaming';
import { EventBus } from './EventBus.js';
import { ProductErrorCode } from './enums.js';
import { ProductError } from './errors.js';
import { logger } from '../commons/logger.js';

export interface JobCreationEvent {
  queue: string;
  data: any;
  options: any;
}

export interface StreamOptions<TData = any> {
  data: TData;
  userId: string;
  forceRefresh?: boolean;
  fetcher: (data: TData, userId: string) => Promise<any[]>;
  cacheChecker?: (hash: string) => Promise<any[] | null>;
}

export abstract class AbstractProduct<TJobData = any> extends EventEmitter {
  abstract readonly id: string;
  abstract readonly jobOptions: any;
  abstract readonly actions: Record<string, any>;
  abstract readonly workers: any[];
  abstract readonly routes: any[];

  protected boss: PgBoss | null = null;
  protected workerSubscriptions: string[] = [];

  async start(boss: PgBoss) {
    try {
      this.boss = boss;
      await this.registerWorkers(boss);
      await this.onStart(boss);
    } catch (error: any) {
      throw new ProductError(ProductErrorCode.START_FAILED, {
        message: `Failed to start product ${this.id}: ${error.message}`,
        originalError: error
      });
    }
  }

  protected async onStart(boss: PgBoss) {
    // Optional hook for subclasses
  }

  async stop() {
    try {
      await this.unregisterWorkers();
      await this.onStop();
    } catch (error: any) {
      throw new ProductError(ProductErrorCode.STOP_FAILED, {
        message: `Failed to stop product ${this.id}: ${error.message}`,
        originalError: error
      });
    }
  }

  protected async onStop() {
    // Optional hook
  }

  async pause() {
    try {
      await this.unregisterWorkers();
    } catch (error: any) {
      throw new ProductError(ProductErrorCode.PAUSE_FAILED, {
        message: `Failed to pause product ${this.id}: ${error.message}`,
        originalError: error
      });
    }
  }

  async resume() {
    if (!this.boss) {
      throw new ProductError(ProductErrorCode.RESUME_FAILED, 'PgBoss not initialized');
    }
    try {
      await this.registerWorkers(this.boss);
    } catch (error: any) {
      throw new ProductError(ProductErrorCode.RESUME_FAILED, {
        message: `Failed to resume product ${this.id}: ${error.message}`,
        originalError: error
      });
    }
  }

  protected async registerWorkers(boss: PgBoss) {
    if (!this.workers) return;

    for (const WorkerClass of this.workers) {
      try {
        // @ts-ignore
        const workerInstance = new WorkerClass();
        // Inject the EventBus so the worker can emit job events
        (workerInstance as any).emitter = EventBus;
        // Inject boss instance for advanced operations like cancellation check
        (workerInstance as any).boss = boss;

        logger.info(`[${this.id}] Registering worker for queue: ${workerInstance.queueName}`);

        await boss.createQueue(workerInstance.queueName, workerInstance.queueOptions);

        const workOptions = (workerInstance as any).teamSize ? { teamSize: (workerInstance as any).teamSize } : {};
        const subscriptionId = await boss.work(workerInstance.queueName, workOptions as any, (job: any) => workerInstance.handler(job));
        this.workerSubscriptions.push(subscriptionId);
      } catch (error: any) {
        throw new ProductError(ProductErrorCode.WORKER_REGISTRATION_FAILED, {
          message: `Failed to register worker for ${this.id}: ${error.message}`,
          worker: WorkerClass.name
        });
      }
    }
  }

  protected async unregisterWorkers() {
    if (!this.boss) return;

    for (const subId of this.workerSubscriptions) {
      try {
        // @ts-ignore - Assuming offWork exists in PgBoss type or at runtime
        await this.boss.offWork(subId);
      } catch (error: any) {
        logger.warn(`[${this.id}] Failed to unregister worker subscription ${subId}: ${error.message}`);
      }
    }
    this.workerSubscriptions = [];
  }

  async sendJob(queue: string, data: TJobData, options: any = {}) {
    if (!this.boss) {
      throw new ProductError(ProductErrorCode.JOB_SUBMISSION_FAILED, 'PgBoss not initialized');
    }
    const event: JobCreationEvent = { queue, data, options };
    // Emit event to allow subscribers to modify data/options
    EventBus.emit('job:create', event);

    try {
      return await this.boss.send(queue, event.data, event.options);
    } catch (error: any) {
      throw new ProductError(ProductErrorCode.JOB_SUBMISSION_FAILED, {
        message: `Failed to send job to ${queue}: ${error.message}`,
        queue
      });
    }
  }

  async waitForJob(jobId: string, timeoutMs: number = 60000): Promise<any> {
    return new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        cleanup();
        reject(new ProductError(ProductErrorCode.JOB_TIMEOUT, { message: 'Job timeout', jobId }));
      }, timeoutMs);

      const onComplete = (event: any) => {
        if (event.jobId === jobId) {
          cleanup();
          resolve(event.result);
        }
      };

      const cleanup = () => {
        clearTimeout(timer);
        EventBus.off('job:complete', onComplete);
      };

      EventBus.on('job:complete', onComplete);
    });
  }

  async waitForHash(targetHash: string, timeoutMs: number = 60000): Promise<any> {
    return new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        cleanup();
        reject(new ProductError(ProductErrorCode.JOB_TIMEOUT, { message: 'Job timeout (hash wait)', hash: targetHash }));
      }, timeoutMs);

      const onComplete = (event: any) => {
        if (!event.data) return;
        try {
          const eventHash = this.hash(event.data);
          if (eventHash === targetHash) {
            cleanup();
            resolve(event.result);
          }
        } catch (e) {
          // Ignore hashing errors (mismatched data types from other queues)
        }
      };

      const cleanup = () => {
        clearTimeout(timer);
        EventBus.off('job:complete', onComplete);
      };

      EventBus.on('job:complete', onComplete);
    });
  }

  protected async handleStream(c: any, options: StreamOptions) {
    const { data, userId, forceRefresh, fetcher, cacheChecker } = options;

    const inputHash = this.generateHash(data);

    return streamSSE(c, async (stream) => {
      try {
        await stream.writeSSE({
          event: 'progress',
          data: JSON.stringify({ stage: 'starting', percent: 0 })
        });

        if (!forceRefresh && cacheChecker) {
          await stream.writeSSE({
            event: 'progress',
            data: JSON.stringify({ stage: 'checking_cache', percent: 10 })
          });

          const cached = await cacheChecker(inputHash);
          if (cached) {
            for (let i = 0; i < cached.length; i++) {
              await stream.writeSSE({
                event: 'result',
                data: JSON.stringify(cached[i])
              });
            }
            await stream.writeSSE({
              event: 'complete',
              data: JSON.stringify({ total: cached.length, cached: true })
            });
            return;
          }
        }

        await stream.writeSSE({
          event: 'progress',
          data: JSON.stringify({ stage: 'fetching_from_api', percent: 20 })
        });

        const results = await fetcher(data, userId);

        for (let i = 0; i < results.length; i++) {
          await stream.writeSSE({
            event: 'result',
            data: JSON.stringify(results[i])
          });
        }

        await stream.writeSSE({
          event: 'complete',
          data: JSON.stringify({ total: results.length, cached: false })
        });

      } catch (error: any) {
        logger.error(error, `[${this.id}] Stream error`);
        await stream.writeSSE({
          event: 'error',
          data: JSON.stringify({ error: error.message || 'Internal Server Error' })
        });
      }
    });
  }

  // Helper for hashing
  protected generateHash(params: any) {
    const normalizedInput = JSON.stringify(params, Object.keys(params).sort());
    return createHash('sha256').update(normalizedInput).digest('hex');
  }

  abstract hash(data: TJobData): string;
  abstract meta(userId: string): any;
}
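For orientation, here is a minimal sketch of how a concrete product might extend AbstractProduct. It is illustrative only and not part of the commit; the class name, queue contents, and empty worker/route lists are placeholders, and real products (such as the LocationsProduct exercised by the tests below) define their own.

```ts
// Illustrative only — not part of this commit. Names are placeholders.
import { AbstractProduct } from './AbstractProduct.js';

class ExampleProduct extends AbstractProduct<{ query: string }> {
  readonly id = 'example';
  readonly jobOptions = {};
  readonly actions = {};
  readonly workers: any[] = []; // e.g. [ExampleWorker] with queueName/handler
  readonly routes: any[] = [];

  // Reuse the shared sha256 helper so cache lookups and waitForHash agree
  hash(data: { query: string }) {
    return this.generateHash(data);
  }

  meta(userId: string) {
    return { userId };
  }
}
```

A host process would then call `product.start(boss)` once a PgBoss instance is connected, and `product.sendJob(queue, data)` to enqueue work through the `job:create` EventBus hook.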
3 packages/shared/src/server/EventBus.ts Normal file
@@ -0,0 +1,3 @@
import EventEmitter from 'events';

export const EventBus = new EventEmitter();
22 packages/shared/src/server/enums.ts Normal file
@@ -0,0 +1,22 @@
export enum ProductErrorCode {
  // Lifecycle Errors
  START_FAILED = 'PRODUCT_START_FAILED',
  STOP_FAILED = 'PRODUCT_STOP_FAILED',
  PAUSE_FAILED = 'PRODUCT_PAUSE_FAILED',
  RESUME_FAILED = 'PRODUCT_RESUME_FAILED',

  // Worker Errors
  WORKER_REGISTRATION_FAILED = 'WORKER_REGISTRATION_FAILED',
  WORKER_NOT_FOUND = 'WORKER_NOT_FOUND',

  // Job Errors
  JOB_SUBMISSION_FAILED = 'JOB_SUBMISSION_FAILED',
  JOB_TIMEOUT = 'JOB_TIMEOUT',

  // Configuration Errors
  INVALID_CONFIG = 'INVALID_CONFIG',
  MISSING_DEPENDENCY = 'MISSING_DEPENDENCY',

  // Generic
  UNKNOWN_ERROR = 'UNKNOWN_ERROR'
}
29 packages/shared/src/server/errors.ts Normal file
@@ -0,0 +1,29 @@
import { ProductErrorCode } from './enums.js';

export interface ProductErrorPayload {
  message: string;
  [key: string]: any;
}

export class ProductError extends Error {
  public readonly code: ProductErrorCode;
  public readonly payload: ProductErrorPayload;

  constructor(code: ProductErrorCode, payload: ProductErrorPayload | string) {
    const message = typeof payload === 'string' ? payload : payload.message;
    super(message);
    this.code = code;
    this.payload = typeof payload === 'string' ? { message: payload } : payload;

    // Restore prototype chain
    Object.setPrototypeOf(this, new.target.prototype);
  }

  toJSON() {
    return {
      code: this.code,
      message: this.message,
      payload: this.payload
    };
  }
}
@@ -0,0 +1,94 @@
import { describe, it, expect, beforeAll } from 'vitest';
import * as dotenv from 'dotenv';
import path from 'path';

// Load env from server root
dotenv.config({ path: path.resolve(__dirname, '../../../../.env') });

describe('PyGADM Integration', () => {
  let LocationsProduct: any;

  beforeAll(async () => {
    const productModule = await import('../index.js');
    LocationsProduct = new productModule.LocationsProduct();
  });

  it('should search for regions using python wrapper', async () => {
    const c = {
      req: {
        valid: () => ({ query: 'France' })
      },
      json: (data: any, status: number) => ({ data, status })
    };

    const result = await LocationsProduct.handleGetRegionSearch(c);
    if (result.status !== 200) {
      console.error('PyGADM Search Error:', JSON.stringify(result, null, 2));
    }
    expect(result.status).toBe(200);
    expect(result.data).toBeDefined();
    // console.log('PyGADM Search Result:', JSON.stringify(result.data, null, 2));
  });

  it('should search for regions with content_level', async () => {
    const c = {
      req: {
        valid: () => ({ query: 'France', content_level: '1' })
      },
      json: (data: any, status: number) => ({ data, status })
    };

    const result = await LocationsProduct.handleGetRegionSearch(c);
    expect(result.status).toBe(200);
    expect(result.data).toBeDefined();
  });

  it('should search for regions with geojson output', async () => {
    const c = {
      req: {
        valid: () => ({ query: 'France', content_level: '1', geojson: 'true' })
      },
      json: (data: any, status: number) => ({ data, status })
    };

    const result = await LocationsProduct.handleGetRegionSearch(c);
    // console.log('PyGADM GeoJSON Search Result:', JSON.stringify(result.data, null, 2));
    expect(result.status).toBe(200);
    expect(result.data).toBeDefined();
    expect(result.data.type).toBe('FeatureCollection');
    expect(result.data.features).toBeDefined();
    expect(Array.isArray(result.data.features)).toBe(true);
  });

  it('should get boundary using python wrapper', async () => {
    const c = {
      req: {
        valid: () => ({ id: 'FRA.1_1' }) // Region for Auvergne-Rhône-Alpes
      },
      json: (data: any, status: number) => ({ data, status })
    };

    const result = await LocationsProduct.handleGetRegionBoundary(c);
    if (result.status === 200) {
      expect(result.data).toBeDefined();
    } else {
      console.log('PyGADM Boundary Result Status:', result.status, result.data);
    }
  });

  it('should get region names using python wrapper', async () => {
    const c = {
      req: {
        valid: () => ({ admin: 'FRA', content_level: '2' })
      },
      json: (data: any, status: number) => ({ data, status })
    };

    const result = await LocationsProduct.handleGetRegionNames(c);
    if (result.status !== 200) {
      console.error('PyGADM Names Error:', JSON.stringify(result, null, 2));
    }
    expect(result.status).toBe(200);
    expect(result.data).toBeDefined();
    console.log('PyGADM Names Result (Subset):', JSON.stringify(result.data.data.slice(0, 2), null, 2));
  });
});
313 packages/shared/src/server/locations/__tests__/e2e.test.ts Normal file
@@ -0,0 +1,313 @@
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import * as dotenv from 'dotenv';
import path from 'path';

// Load env from server root
dotenv.config({ path: path.resolve(__dirname, '../../../../.env') });

import { PgBoss } from 'pg-boss';
// import { submitJob } from '../../events.js';
// Ensure registry is loaded to register subscriber
import '../../registry.js';
import { LOCATIONS_JOB_OPTIONS } from '../constants.js';

// Dynamic imports to ensure env vars are loaded before modules that use them
let boss: PgBoss;
let startBoss: any;
let SearchWorker: any;
let supabase: any;
let logger: any;

describe('Locations Product E2E', () => {
  let worker: any;
  let TEST_USER_ID: string;
  const TEST_QUERY = 'plastichub';
  const TEST_LOCATION = 'Barcelona, Spain';

  beforeAll(async () => {
    // Import modules now that env vars are loaded
    // Adjust paths relative to this file: src/products/locations/__tests__/e2e.test.ts
    const clientModule = await import('../../../jobs/boss/client.js');
    boss = clientModule.boss as PgBoss;
    startBoss = clientModule.startBoss;

    // Import the worker from the product
    const workerModule = await import('../pgboss.js');
    SearchWorker = workerModule.LocationsWorker;

    const supabaseModule = await import('../../../commons/supabase.js');
    supabase = supabaseModule.supabase;

    const loggerModule = await import('../../../commons/logger.js');
    logger = loggerModule.logger;

    // Ensure boss is connected
    if (!boss) {
      throw new Error('PgBoss not initialized (check DATABASE_URL)');
    }

    await startBoss();

    // Initialize product (lifecycle start)
    // We need to import LocationsProduct to call start
    const productModule = await import('../index.js');
    const LocationsProduct = productModule.LocationsProduct;
    if ((LocationsProduct as any).start) {
      await (LocationsProduct as any).start(boss);
    }

    // Authenticate as the test user
    const { data, error } = await supabase.auth.signInWithPassword({
      email: 'sales@plastic-hub.com',
      password: 'customer',
    });

    if (error || !data.session) {
      console.error('Failed to sign in test user:', error);
      throw new Error('Failed to sign in test user');
    }

    TEST_USER_ID = data.user.id;
    logger.info(`E2E Test: Authenticated as ${TEST_USER_ID}`);

    // Clean up existing data
    const { error: deleteError } = await supabase
      .from('locations')
      .delete()
      .eq('user_id', TEST_USER_ID)
      .eq('title', 'Plastic Hub');

    if (deleteError) {
      logger.warn('Failed to clean up existing locations:', deleteError);
    } else {
      logger.info('Cleaned up existing locations for test');
    }

    // Register the worker in this process
    worker = new SearchWorker();
    logger.info(`Worker Queue Name: ${worker.queueName}`);
    await boss.createQueue(worker.queueName);
    await boss.work(worker.queueName, worker.handler.bind(worker));

    logger.info('E2E Test: Worker started');
  });

  afterAll(async () => {
    if (boss) {
      // pg-boss does not expose a simple "delete all jobs in queue" method on the
      // instance, so we stop the boss to halt processing. Jobs created by individual
      // tests are deleted in their own cleanup blocks when we have the job ID.
      await boss.stop();
    }
  });

  it('should process a real search job and update Supabase', async () => {
    if (!boss) return;

    // 1. Send Job
    const jobData = {
      query: TEST_QUERY,
      location: TEST_LOCATION,
      userId: TEST_USER_ID,
      filters: {
        concurrency: 1
      }
    };

    let jobId: string | null = null;

    try {
      // Use submitJob helper to trigger event-based metadata injection
      // jobId = await submitJob(worker.queueName, jobData, {}, boss);
      jobId = await boss.send(worker.queueName, jobData);
      expect(jobId).toBeDefined();
      logger.info(`Job submitted: ${jobId}`);

      // Wait for the job to be processed. Since the worker runs in the same
      // process, it should pick the job up; poll the database for the result.

      let attempts = 0;
      const maxAttempts = 20; // 20 * 1s = 20s
      let found = false;

      while (attempts < maxAttempts && !found) {
        await new Promise(resolve => setTimeout(resolve, 1000));
        const { data: locations } = await supabase
          .from('locations')
          .select('place_id, title')
          .eq('user_id', TEST_USER_ID)
          .eq('title', 'Plastic Hub');

        if (locations && locations.length > 0) {
          found = true;
          logger.info('Locations found:', locations.length);
          expect(locations).toBeDefined();
          expect(locations.length).toBeGreaterThan(0);
          expect(locations[0].title).toBe('Plastic Hub');
        }
        attempts++;
      }

      if (!found) {
        throw new Error('Timeout waiting for job to process and update DB');
      }

    } catch (err: any) {
      logger.error({ err }, 'Error sending/processing job');
      throw err;
    } finally {
      if (jobId) {
        // Try to cancel/delete the job to clean up
        try {
          await boss.deleteJob(worker.queueName, jobId);
          logger.info(`Cleaned up job ${jobId}`);
        } catch (e) {
          logger.warn(`Failed to clean up job ${jobId}: ${e}`);
        }
      }
    }
  }, 30000); // Increased timeout

  it('should prevent duplicate jobs using singleton check and verify metadata', async () => {
    if (!boss) return;

    const jobData = {
      query: 'duplicate-check',
      userId: '39d5210c-23b0-430e-bbfc-00b48f67c899',
      filters: {
        concurrency: 1
      },
      location: 'Nowhere'
    };

    // First submission should succeed
    // const jobId1 = await submitJob(worker.queueName, jobData, {}, boss);
    const jobId1 = await boss.send(worker.queueName, jobData);
    expect(jobId1).toBeDefined();
    logger.info(`Singleton Test: First job submitted: ${jobId1}`);

    // Verify metadata
    const job = await boss.getJobById(worker.queueName, jobId1!);
    expect(job).not.toBeNull();
    if (job) {
      expect(job.data).toBeDefined();
      // Check if metadata was injected (cost, timestamp)
      // Note: userId is already in jobData, but getMetadata adds cost and timestamp
      expect((job.data as any).cost).toBeDefined();
      expect((job.data as any).timestamp).toBeDefined();
      expect((job.data as any).userId).toBe(jobData.userId);

      // Verify job options from constants
      // Priority Low = 10
      expect(job.priority).toBe(10);
      expect(job.retryLimit).toBe(5);

      logger.info('Singleton Test: Metadata verification successful');
    } else {
      logger.warn('Singleton Test: Job not found (might be completed/archived)');
    }

    // Second submission should fail (return null) because of singletonKey
    // const jobId2 = await submitJob(worker.queueName, jobData, {}, boss);
    const jobId2 = await boss.send(worker.queueName, jobData);
    expect(jobId2).toBeNull();
    logger.info('Singleton Test: Duplicate job correctly rejected (returned null)');

    // Clean up
    if (jobId1) {
      try {
        await boss.deleteJob(worker.queueName, jobId1);
        logger.info(`Singleton Test: Cleaned up job ${jobId1}`);
      } catch (e) {
        logger.warn(`Singleton Test: Failed to clean up job ${jobId1}: ${e}`);
      }
    }
  });

  it('should resume cancelled jobs on product start', async () => {

    const options = { retryDelay: 1, retryBackoff: true, retryDelayMax: 10 };
    const batchSize = 4;
    /*
    await Promise.all([
      boss.send(worker.queueName, null, options),
      boss.send(worker.queueName, null, options),
      boss.send(worker.queueName, null, options),
      boss.send(worker.queueName, null, options)
    ])*/

    // const jobs = await boss.fetch(worker.queueName, { batchSize, includeMetadata: true })

    // 1. Submit a job with delay so we can cancel it before it runs

    const jobData = {
      query: TEST_QUERY,
      location: TEST_LOCATION,
      userId: TEST_USER_ID
    };

    // Use a unique query to avoid the singleton check (append a timestamp)
    jobData.query = `${TEST_QUERY}-${Date.now()}`;

    // const jobId = await submitJob(boss, worker.queueName, jobData, { startAfter: 10 });
    const job1 = await boss.send(worker.queueName, LOCATIONS_JOB_OPTIONS, options);
    await boss.work(worker.queueName, async ([job]) => {
      debugger;
      console.log(`received job ${job.id} with data ${JSON.stringify(job.data)}`);
    });
    const jobs2 = await boss.fetch(worker.queueName);
    const jobId = '';
    expect(jobId).toBeDefined();
    logger.info(`Resume Test: Job submitted: ${jobId}`);

    // 2. Cancel the job
    await boss.cancel(worker.queueName, jobId!);

    // 3. Verify it is cancelled
    let job = await boss.getJobById(worker.queueName, jobId!);
    expect(job).not.toBeNull();
    expect(job!.state).toBe('cancelled');
    logger.info(`Resume Test: Job cancelled`);

    // 4. Call LocationsProduct.start to resume
    const productModule = await import('../index.js');
    const LocationsProduct = productModule.LocationsProduct;
    if ((LocationsProduct as any).start) {
      await (LocationsProduct as any).start(boss);
    }

    // 5. Verify it is resumed (state should be created or active)
    // Wait a bit for the resume to happen (it's async inside start)
    await new Promise(resolve => setTimeout(resolve, 1000));

    job = await boss.getJobById(worker.queueName, jobId!);
    expect(job).not.toBeNull();
    logger.info(`Resume Test: Job state after resume: ${job!.state}`);
    expect(job!.state).not.toBe('cancelled');
    // It should probably be 'created' (waiting for startAfter?) or 'active' if startAfter was cleared?
    // pg-boss resume might reset it to created.
    expect(['created', 'active', 'completed']).toContain(job!.state);

    // Clean up
    if (jobId) {
      try {
        await boss.deleteJob(worker.queueName, jobId);
      } catch (e) {
        // ignore
      }
    }
  });
});
19 packages/shared/src/server/locations/cache.ts Normal file
@@ -0,0 +1,19 @@
import { createHash } from 'crypto';
import { getSearchByHash, getLocationsByPlaceIds } from './db.js';

export const params_hash = (params: any) => {
  // Normalize input for hashing
  // Sort keys to ensure a consistent hash
  const normalizedInput = JSON.stringify(params, Object.keys(params).sort());
  return createHash('sha256').update(normalizedInput).digest('hex');
};

export const get_cached = async (inputHash: string) => {
  const searchData = await getSearchByHash(inputHash);

  if (searchData && searchData.result_place_ids) {
    const locations = await getLocationsByPlaceIds(searchData.result_place_ids);
    return locations;
  }
  return null;
};
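These helpers line up with the `cacheChecker` hook on `StreamOptions` in AbstractProduct.ts (both hash with sha256 over sorted keys). A product's stream handler could presumably wire them together roughly as sketched below; `fetchFromApi` and the request fields are placeholders, not part of this commit.

```ts
// Illustrative wiring only — fetchFromApi and the data shape are placeholders.
import { get_cached } from './cache.js';

// Inside a subclass of AbstractProduct, e.g. a route handler method:
//
//   return this.handleStream(c, {
//     data: { query, location },           // hashed via generateHash for the cache key
//     userId,
//     forceRefresh,
//     fetcher: (data, userId) => fetchFromApi(data, userId), // hits the external API
//     cacheChecker: (hash) => get_cached(hash)               // serves cached rows if present
//   });
```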
37 packages/shared/src/server/locations/constants.ts Normal file
@@ -0,0 +1,37 @@
// Locations Constants
export const COST_PER_SEARCH = parseInt(process.env.LOCATIONS_COST || '1');
export const TIMEOUT_MS_LOCATIONS = parseInt(process.env.LOCATIONS_TIMEOUT_MS || '300000');
export const MAX_CONCURRENCY_LOCATIONS = 1; // Kept hardcoded as it wasn't requested to be moved, but could be.

export const LOCATIONS_JOB_OPTIONS = {
  expireInSeconds: parseInt(process.env.LOCATIONS_JOB_EXPIRE_SECONDS || '600'),
  retryBackoff: true,
  retryDelayMax: parseInt(process.env.LOCATIONS_JOB_RETRY_DELAY_MAX || '1000')
};

// Email Constants
export const EMAIL_SEARCH_COST = parseInt(process.env.EMAIL_SEARCH_COST || '5');
export const EMAIL_MAX_BATCH_SIZE = parseInt(process.env.MAX_PLACES_PER_REQUEST || '100');

export const EMAIL_META_TIMEOUT_MS = parseInt(process.env.EMAIL_META_TIMEOUT_SECONDS || '30') * 1000;
export const EMAIL_SEARCH_TIMEOUT_MS = parseInt(process.env.EMAIL_SEARCH_TIMEOUT_SECONDS || '60') * 1000;
export const EMAIL_SEARCH_PAGE_TIMEOUT_MS = parseInt(process.env.EMAIL_SEARCH_PAGE_TIMEOUT_SECONDS || '10') * 1000;
export const EMAIL_SEARCH_MAX_PAGES = parseInt(process.env.EMAIL_SEARCH_MAX_PAGES || '15');
export const EMAIL_SEARCH_PAGE_CONCURRENCY = parseInt(process.env.EMAIL_SEARCH_PAGE_CONCURRENCY || '2');
export const EMAIL_STREAM_TIMEOUT_MS = parseInt(process.env.EMAIL_STREAM_TIMEOUT_SECONDS || '300') * 1000;
export const EMAIL_MAX_CONCURRENT_JOBS = parseInt(process.env.EMAIL_MAX_CONCURRENT_JOBS || '5');

export const JOB_NAME = 'locations-find';
export const EMAIL_JOB_NAME = 'email-find';

export enum JobPriority {
  Low = 10,
  Medium = 20,
  High = 30
}

export const EMAIL_JOB_OPTIONS = {
  priority: JobPriority.Medium,
  retryLimit: parseInt(process.env.EMAIL_JOB_RETRY_LIMIT || '1'),
  expireInSeconds: parseInt(process.env.EMAIL_JOB_EXPIRE_SECONDS || '60'),
};
71 packages/shared/src/server/locations/db.ts Normal file
@@ -0,0 +1,71 @@
import { supabase } from '@/commons/supabase.js';
import { logger } from './logger.js';

export const getSearchByHash = async (inputHash: string) => {
  const { data, error } = await supabase
    .from('searches')
    .select('result_place_ids')
    .eq('input_hash', inputHash)
    .single();

  if (error && error.code !== 'PGRST116') { // PGRST116 is "The result contains 0 rows"
    logger.error({ err: error }, 'Error fetching search by hash');
  }
  return data;
};

export const getLocationsByPlaceIds = async (placeIds: string[]) => {
  if (placeIds.length === 0) return [];
  const { data, error } = await supabase
    .from('locations')
    .select('*')
    .in('place_id', placeIds);

  if (error) {
    logger.error({ err: error }, 'Error fetching locations');
    throw error;
  }
  return data;
};

export const getLocationsMeta = async (placeIds: string[]) => {
  if (placeIds.length === 0) return [];
  const { data, error } = await supabase
    .from('locations')
    .select('place_id, meta')
    .in('place_id', placeIds);

  if (error) {
    logger.error({ err: error }, 'Error fetching locations meta');
    return [];
  }
  return data;
};

export const upsertLocations = async (locations: any[]) => {
  if (locations.length === 0) return;
  const { error } = await supabase
    .from('locations')
    .upsert(locations, { onConflict: 'place_id' });

  if (error) {
    logger.error({ err: error }, 'Error upserting locations');
    throw error;
  }
};

export const storeSearch = async (inputHash: string, inputParams: any, placeIds: string[]) => {
  const { error } = await supabase
    .from('searches')
    .upsert({
      input_hash: inputHash,
      input_params: inputParams,
      result_place_ids: placeIds,
      created_at: new Date().toISOString()
    }, { onConflict: 'input_hash' });

  if (error) {
    logger.error({ err: error }, 'Error storing search');
  }
};
@@ -0,0 +1,27 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { MetaEnricher } from '../meta.js';

// Mock axios and puppeteer if necessary, but for basic unit testing we might just want to
// check the interface or mock the private methods if we could.
// Since parseHtml is private, we test enrich().

describe('MetaEnricher', () => {
  let enricher: MetaEnricher;

  beforeEach(() => {
    enricher = new MetaEnricher();
  });

  it('should have correct name and type', () => {
    expect(enricher.name).toBe('meta');
    expect(enricher.type).toBe('meta');
  });

  it('should return empty object if no website', async () => {
    const result = await enricher.enrich({ place_id: '1', title: 'test' } as any, { userId: 'u1' });
    expect(result).toEqual({});
  });

  // Deeper testing requires mocking parseHtml logic which uses external libs.
  // For this scope, ensuring it handles basic input covers the wiring.
});
@@ -0,0 +1,33 @@
import { describe, it, expect, beforeEach } from 'vitest';
import { EnricherRegistry, IEnricher, EnrichmentContext } from '../registry.js';
import { CompetitorFull } from '@polymech/shared';

class MockEnricher implements IEnricher {
  name = 'mock';
  type = 'test';
  async enrich(location: CompetitorFull, context: EnrichmentContext) {
    return { title: 'enriched' };
  }
}

describe('EnricherRegistry', () => {
  // Clear registry before each test if possible, but the map is static.
  // We rely on unique names or just test existence.

  it('should register and retrieve an enricher', () => {
    const enricher = new MockEnricher();
    EnricherRegistry.register('test-mock', enricher);

    expect(EnricherRegistry.get('test-mock')).toBe(enricher);
  });

  it('should return undefined for unknown enricher', () => {
    expect(EnricherRegistry.get('unknown')).toBeUndefined();
  });

  it('should list all enrichers', () => {
    const initialCount = EnricherRegistry.getAll().length;
    EnricherRegistry.register('test-mock-2', new MockEnricher());
    expect(EnricherRegistry.getAll().length).toBeGreaterThanOrEqual(1);
  });
});
306 packages/shared/src/server/locations/enrichers/meta.ts Normal file
@@ -0,0 +1,306 @@
import { CompetitorFull } from '@polymech/shared';
import { IEnricher, EnrichmentContext } from './registry.js';
import axios, { AxiosRequestConfig } from 'axios';
import * as cheerio from 'cheerio';
import { URL } from 'url';
import puppeteer from 'puppeteer';
import puppeteerExtra from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import https from 'https';
import { logger } from '@polymech/search';

const puppeteerExtraAny = puppeteerExtra as any;
puppeteerExtraAny.use(StealthPlugin());

interface Og {
  [key: string]: string | undefined;
}

interface Meta {
  [key: string]: string | undefined;
}

interface Image {
  src: string;
}

interface PageType {
  url: string;
  source: string;
  status: string;
}

interface Structured {
  [key: string]: any;
}

interface LocationSiteMeta {
  title?: string;
  description?: string;
  image?: string;
  url?: string;
  social?: PageType[];
  seo?: {
    keywords?: string[];
    structured?: Structured[];
    og?: Og;
    metaTags?: Meta;
  };
  pages?: PageType[];
  externalLinks?: PageType[];
  images?: Image[];
}

export class MetaEnricher implements IEnricher {
  name = 'meta';
  type = 'meta';

  async enrich(location: CompetitorFull, context: EnrichmentContext): Promise<Partial<CompetitorFull>> {
    if (!context.logger) {
      context.logger = logger;
    }
    if (!location.website) {
      if (context.logger) context.logger.debug({ placeId: location.place_id }, 'No website to enrich');
      return {};
    }

    try {
      if (context.logger) context.logger.info({ website: location.website }, 'Starting meta enrichment');
      const meta = await this.parseHtml(location.website, null, { headless: true }); // Default to headless puppeteer for better success rate

      const updates: Partial<CompetitorFull> = {
        raw_data: {
          ...location.raw_data,
          meta: meta
        }
      };

      if (context.logger) {
        const socialCount = meta.social?.length || 0;
        const emailCount = (meta.seo?.structured || []).length; // structured data count, rough proxy
        context.logger.info({
          website: location.website,
          socials: meta.social?.map(s => s.source),
          title: meta.title ? 'Found' : 'Missing',
          emails: meta.seo?.metaTags?.['email'] || 'None'
        }, 'Meta enrichment complete');
      }

      // Extract social links to main object if not present
      if (meta.social) {
        // We can't directly assign to unmapped fields if the schema doesn't exist yet,
        // but the implementation plan said 'schemas.ts' might be updated.
        // For now, sticking to raw_data.meta is safe, but we can also return them
        // and let the caller/schema handle it.
      }

      return updates;
    } catch (error: any) {
      if (context.logger) {
        context.logger.error(`Error enriching meta for ${location.website}: ${error.message}`);
      }
      return {}; // Return empty update on failure
    }
  }

  private isValidUrl(url: string) {
    try {
      new URL(url);
      return true;
    } catch (error) {
      return false;
    }
  }

  private readMetaTags($: cheerio.CheerioAPI, name: string) {
    return $(`meta[name="${name}"]`).attr('content') || $(`meta[property="${name}"]`).attr('content') || null;
  }

  private static browserPromise: Promise<puppeteer.Browser> | null = null;
  private static idleTimer: NodeJS.Timeout | null = null;
  private static IDLE_TIMEOUT_SECONDS = parseInt(process.env.ENRICHER_META_IDLE_TIMEOUT || '60');

  private static resetIdleTimer() {
    if (MetaEnricher.idleTimer) clearTimeout(MetaEnricher.idleTimer);
    MetaEnricher.idleTimer = setTimeout(async () => {
      if (MetaEnricher.browserPromise) {
        // Idle timeout reached; no logger context here, so close the browser silently
        const browser = await MetaEnricher.browserPromise;
        await browser.close();
        MetaEnricher.browserPromise = null;
      }
    }, MetaEnricher.IDLE_TIMEOUT_SECONDS * 1000);
  }

  private static async getBrowser(): Promise<puppeteer.Browser> {
    MetaEnricher.resetIdleTimer();
    if (MetaEnricher.browserPromise) return MetaEnricher.browserPromise;

    logger.info(`[Puppeteer] Launching new browser`);
    MetaEnricher.browserPromise = (async () => {
      const browser = await puppeteerExtraAny.launch({
        headless: "new",
        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
      });
      return browser;
    })();

    return MetaEnricher.browserPromise;
  }

  private async parseHtml(url: string, config: AxiosRequestConfig | null, options: any): Promise<LocationSiteMeta> {
    if (!/(^http(s?):\/\/[^\s$.?#].[^\s]*)/i.test(url)) return {} as LocationSiteMeta;

    let content = '';

    if (options && options.headless) {
      try {
        const browser = await MetaEnricher.getBrowser();
        const page = await browser.newPage();
        try {
          await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

          // Allow shorter timeout for meta tags
          const timeout = options.timeout || 15000;

          await page.goto(url, { waitUntil: 'domcontentloaded', timeout });
          content = await page.content();
        } finally {
          await page.close();
          MetaEnricher.resetIdleTimer();
        }
      } catch (e: any) {
        // Fallback to axios if puppeteer fails (or specific connection/timeout error)
        // console.error(`Puppeteer failed for ${url}: ${e.message}`);
      }
    }

    if (!content) {
      try {
        const { data } = await axios(url, {
          ...config,
          httpsAgent: new https.Agent({ rejectUnauthorized: false }),
          headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
          },
          timeout: 10000
        });
        content = data;
      } catch (e: any) {
        // logger.error(`Axios failed for ${url}: ${e.message}`);
        return {} as LocationSiteMeta;
      }
    }

    const $ = cheerio.load(content);
    const og: Og = {};
    const meta: Meta = {};
    const images: Image[] = [];
    const links: string[] = [];
    let allLinks: string[] = [];

    const title = $('title').text();
    if (title) meta.title = title;

    const canonical = $('link[rel=canonical]').attr('href');
    if (canonical) meta.url = canonical;

    ['title', 'description', 'image'].forEach(s => {
      const val = this.readMetaTags($, s);
      if (val) meta[s] = val;
    });

    ['og:title', 'og:description', 'og:image', 'og:url', 'og:site_name', 'og:type'].forEach(s => {
      const val = this.readMetaTags($, s);
      if (val) og[s.split(':')[1]] = val;
    });

    $('img').each((i: number, el: any) => {
      let src = $(el).attr('src');
      if (src) {
        try {
          src = new URL(src, url).href;
          images.push({ src });
        } catch (e) {
          // ignore invalid urls
        }
      }
    });

    const jsonLdArray: Structured[] = [];
    $('script[type="application/ld+json"]').each((_: number, element: any) => {
      const jsonLdContent = $(element).html();
      if (jsonLdContent) {
        try {
          const jsonData = JSON.parse(jsonLdContent);
          jsonLdArray.push(jsonData);
        } catch (e) {
          // logger.error(`Error parsing JSON-LD: ${e.message} @ ${url}`);
        }
      }
    });

    $('a').each((index: number, element: any) => {
      let href = $(element).attr('href');
      if (href) {
        try {
          href = new URL(href, url).href;

          if (this.isValidUrl(href)) {
            if (href.indexOf('contact') !== -1 && !links.includes(href)) {
              links.push(href);
            }
            allLinks.push(href);
          }
        } catch (e) {
          // Ignore invalid URLs
        }
      }
    });
    allLinks = [...new Set(allLinks)];

    const socialLinks: PageType[] = [];
    const internalPages: PageType[] = [];
    const externalLinks: PageType[] = [];

    allLinks.forEach(link => {
      if (link.includes('instagram.com')) socialLinks.push({ url: link, source: 'instagram', status: 'PENDING' });
      else if (link.includes('facebook.com')) socialLinks.push({ url: link, source: 'facebook', status: 'PENDING' });
      else if (link.includes('linkedin.com')) socialLinks.push({ url: link, source: 'linkedin', status: 'PENDING' });
      else if (link.includes('youtube.com')) socialLinks.push({ url: link, source: 'youtube', status: 'PENDING' });
      else if (link.includes('twitter.com')) socialLinks.push({ url: link, source: 'twitter', status: 'PENDING' });
      else if (link.includes('mailto:')) { /* ignore mailto */ }
      else {
        try {
          const baseUrl = new URL(url).hostname;
          const linkUrl = new URL(link).hostname;
          if (linkUrl === baseUrl || linkUrl.endsWith('.' + baseUrl)) {
            internalPages.push({ url: link, source: 'site', status: 'PENDING' });
          } else {
            externalLinks.push({ url: link, source: 'external', status: 'PENDING' });
          }
        } catch (e) {
          externalLinks.push({ url: link, source: 'external', status: 'PENDING' });
        }
      }
    });

    return {
      title: meta.title || og.title,
      description: meta.description || og.description,
      image: meta.image || og.image,
      url: meta.url || og.url || url,
      social: socialLinks,
      seo: {
        keywords: ($('meta[property="og:keywords"]').attr("content") ||
          $('meta[name="keywords"]').attr("content") || "").split(',').map((s: any) => s.trim()).filter((s: any) => s),
        structured: jsonLdArray,
        og,
        metaTags: meta
      },
      pages: internalPages,
      externalLinks: externalLinks,
      images
    };
  }
}
34 packages/shared/src/server/locations/enrichers/registry.ts Normal file
@@ -0,0 +1,34 @@
import { CompetitorFull } from '@polymech/shared';

export interface EnrichmentContext {
  userId: string;
  logger?: any;
}

export interface IEnricher {
  name: string;
  type: 'meta' | 'email' | string;

  /**
   * Enrich a single location.
   * @param location The partial competitor data available
   * @param context Execution context
   */
  enrich(location: CompetitorFull, context: EnrichmentContext): Promise<Partial<CompetitorFull>>;
}

export class EnricherRegistry {
  private static enrichers: Map<string, IEnricher> = new Map();

  static register(name: string, enricher: IEnricher) {
    this.enrichers.set(name, enricher);
  }

  static get(name: string): IEnricher | undefined {
    return this.enrichers.get(name);
  }

  static getAll(): IEnricher[] {
    return Array.from(this.enrichers.values());
  }
}
135 packages/shared/src/server/locations/enrichers/service.ts Normal file
@@ -0,0 +1,135 @@
import { EnricherRegistry } from './registry.js';
import { getLocationsByPlaceIds, upsertLocations } from '../db.js';
import { logger } from '../logger.js';
import pMap from 'p-map';

export interface StreamInterface {
  writeSSE(data: { event: string; data: string }): Promise<void>;
}

export class EnrichmentService {

  /**
   * Orchestrates the enrichment process for a list of places and enrichers.
   * Optionally streams progress and updates if a stream is provided.
   */
  async enrichAndStream(
    placeIds: string[],
    enricherNames: string[],
    userId: string,
    stream?: StreamInterface
  ) {
    const requestedEnrichers = enricherNames.map(name => EnricherRegistry.get(name)).filter(e => e !== undefined);

    if (requestedEnrichers.length === 0) {
      throw new Error('No valid enrichers found');
    }

    if (stream) {
      await stream.writeSSE({
        event: 'progress',
        data: JSON.stringify({
          current: 0,
          total: placeIds.length,
          message: `Starting enrichment for ${placeIds.length} items`
        })
      });
    }

    // Fetch current data
    const existingLocations = await getLocationsByPlaceIds(placeIds);
    const locationMap = new Map(existingLocations.map((l: any) => [l.place_id, l]));

    let processed = 0;
    let successCount = 0;
    let failedCount = 0;

    await pMap(placeIds, async (placeId: string) => {
      const location = locationMap.get(placeId);
      if (!location) {
        logger.warn({ placeId }, 'Location not found for enrichment');
        failedCount++;
        return;
      }

      let updates: any = {};
      for (const enricher of requestedEnrichers) {
        try {
          // Enricher handles its own timeouts/shared resources
          const result = await enricher!.enrich(location, { userId, logger });
          if (Object.keys(result).length > 0) {
            logger.info({ placeId, enricher: enricher!.name, keys: Object.keys(result) }, 'Enricher found data');
          } else {
            logger.info({ placeId, enricher: enricher!.name }, 'Enricher found no new data');
          }
          updates = { ...updates, ...result };
        } catch (e: any) {
          logger.error({ placeId, enricher: enricher!.name, err: e }, 'Enricher failed');
        }
      }

      if (Object.keys(updates).length > 0) {
        const updatedLocation = {
          ...location,
          ...updates,
          raw_data: {
            ...(location.raw_data || {}),
            ...(updates.raw_data || {})
          }
        };

        try {
          logger.info({ placeId, updates }, 'Updating location in db');
          await upsertLocations([updatedLocation]);
          successCount++;

          if (stream) {
            await stream.writeSSE({
              event: 'enrichment-update',
              data: JSON.stringify({
                place_id: placeId,
                updates: updates
              })
            });
          }
        } catch (e: any) {
          logger.error({ placeId, err: e }, 'Failed to persist enrichment updates');
          failedCount++;
        }
      } else {
        // No updates found, but technically processed successfully without error
        successCount++;
      }

      processed++;
      if (stream) {
        await stream.writeSSE({
          event: 'progress',
          data: JSON.stringify({
            current: processed,
            total: placeIds.length,
            message: `Enriched ${processed} of ${placeIds.length}`
          })
        });
      }
    }, { concurrency: parseInt(process.env.ENRICHER_META_CONCURRENCY || '5') });

    if (stream) {
      await stream.writeSSE({
        event: 'complete',
        data: JSON.stringify({
          processed,
          success: successCount,
          failed: failedCount
        })
      });
    }

    return { processed, success: successCount, failed: failedCount };
  }
}

export const enrichmentService = new EnrichmentService();
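As a rough sketch of how the registry and service above are meant to be wired together (based only on the APIs in registry.ts, meta.ts, and service.ts): register the enricher under a name, then call the service with that name. The place IDs and user ID below are placeholders, and running without a stream is an assumption about how a non-SSE caller would use it.

```ts
// Illustrative wiring only — identifiers are placeholders, not part of the commit.
import { EnricherRegistry } from './registry.js';
import { MetaEnricher } from './meta.js';
import { enrichmentService } from './service.js';

// Typically done once at startup so 'meta' resolves in EnricherRegistry.get()
EnricherRegistry.register('meta', new MetaEnricher());

async function runMetaEnrichment() {
  // No stream passed: progress events are skipped, only the summary is returned
  const summary = await enrichmentService.enrichAndStream(
    ['place-id-1', 'place-id-2'], // placeholder place IDs
    ['meta'],
    'user-id'                     // placeholder user ID
  );
  console.log(summary); // { processed, success, failed }
}
```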
302
packages/shared/src/server/locations/gadm_wrapper.py
Normal file
302
packages/shared/src/server/locations/gadm_wrapper.py
Normal file
@ -0,0 +1,302 @@
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import pygadm
|
||||
import traceback
|
||||
import hashlib
|
||||
import os
|
||||
|
||||
CACHE_DIR = os.environ.get('GADM_CACHE', './cache/gadm')
|
||||
|
||||
def get_cache_path(prefix, value):
|
||||
hash_object = hashlib.md5(value.encode('utf-8'))
|
||||
hash_hex = hash_object.hexdigest()
|
||||
return os.path.join(CACHE_DIR, f"{prefix}_{hash_hex}.json")
|
||||
|
||||
def read_cache(path):
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except:
|
||||
return None
|
||||
return None
|
||||
|
||||
def write_cache(path, data):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||
with open(path, 'w', encoding='utf-8') as f:
|
||||
json.dump(data, f)
|
||||
except Exception as e:
|
||||
sys.stderr.write(f"Cache write failed: {e}\n")
|
||||
|
||||
def search_regions(query, content_level=None, geojson=False, admin=None):
|
||||
path = get_cache_path(f"search_{content_level if content_level else 'all'}_{'geo' if geojson else 'meta'}_{admin if admin else 'all'}", query)
|
||||
cached = read_cache(path)
|
||||
if cached:
|
||||
print(json.dumps(cached, default=str))
|
||||
return
|
||||
|
||||
try:
|
||||
# If geojson is requested, we need pygadm.Items to get the geometry
|
||||
if geojson:
|
||||
# NOTE: This might still fail if name is ambiguous.
|
||||
# Ideally the UI should search first (getting GIDs), then request boundary by ID.
|
||||
# If a user requests search+geojson by name, they better be specific.
|
||||
kwargs = {'name': [query]}
|
||||
if content_level:
|
||||
kwargs['content_level'] = int(content_level)
|
||||
if admin:
|
||||
kwargs['admin'] = admin
|
||||
|
||||
try:
|
||||
gdf = pygadm.Items(**kwargs)
|
||||
except Exception as e:
|
||||
# Check if query contains comma
|
||||
if ',' in query:
|
||||
first_part = query.split(',')[0].strip()
|
||||
kwargs['name'] = [first_part]
|
||||
try:
|
||||
gdf = pygadm.Items(**kwargs)
|
||||
except:
|
||||
print(json.dumps({"data": []}))
|
||||
return
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
# Metadata search -> Use pygadm.Names to find all matches (handles ambiguity)
|
||||
kwargs = {'name': query}
|
||||
if content_level:
|
||||
kwargs['content_level'] = int(content_level)
|
||||
if admin:
|
||||
kwargs['admin'] = admin
|
||||
|
||||
try:
|
||||
gdf = pygadm.Names(**kwargs)
|
||||
except Exception as e:
|
||||
# Check if query contains comma
|
||||
if ',' in query:
|
||||
# Try searching for the first part
|
||||
first_part = query.split(',')[0].strip()
|
||||
kwargs['name'] = first_part
|
||||
try:
|
||||
gdf = pygadm.Names(**kwargs)
|
||||
except:
|
||||
# Still failed, raise original error or return empty?
|
||||
# Let's print empty for consistent client handling
|
||||
print(json.dumps({"data": []}))
|
||||
return
|
||||
else:
|
||||
# No comma, propagate empty result
|
||||
print(json.dumps({"data": []}))
|
||||
return
|
||||
|
||||
if gdf is None or gdf.empty:
|
||||
print(json.dumps({"data": []}))
|
||||
return
|
||||
|
||||
# Convert to list of dicts for JSON serialization
|
||||
# safe stringify helper
|
||||
def safe_str(v):
|
||||
return str(v) if v is not None else ""
|
||||
|
||||
if geojson:
|
||||
# Full GeoJSON conversion with manual fallback
|
||||
features = []
|
||||
cols_props = [c for c in gdf.columns if c != 'geometry']
|
||||
|
||||
for index, row in gdf.iterrows():
|
||||
properties = {c: safe_str(row[c]) for c in cols_props}
|
||||
try:
|
||||
geometry = row['geometry'].__geo_interface__ if row['geometry'] else None
|
||||
except:
|
||||
geometry = None
|
||||
|
||||
features.append({
|
||||
"type": "Feature",
|
||||
"properties": properties,
|
||||
"geometry": geometry
|
||||
})
|
||||
|
||||
output = {
|
||||
"type": "FeatureCollection",
|
||||
"features": features
|
||||
}
|
||||
else:
|
||||
# Metadata only (from Names or Items)
|
||||
results = []
|
||||
cols = gdf.columns.tolist()
|
||||
for index, row in gdf.iterrows():
|
||||
r = {}
|
||||
for c in cols:
|
||||
r[c] = safe_str(row[c])
|
||||
results.append(r)
|
||||
output = {"data": results}
|
||||
|
||||
write_cache(path, output)
|
||||
print(json.dumps(output, default=str))
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({"error": traceback.format_exc()}))
|
||||
sys.exit(1)
|
||||
|
||||
def get_boundary(gadm_id, name, content_level):
|
||||
path = get_cache_path("boundary", gadm_id)
|
||||
cached = read_cache(path)
|
||||
if cached:
|
||||
print(json.dumps(cached, default=str))
|
||||
return
|
||||
|
||||
try:
|
||||
# Use admin code (GID) directly to avoid ambiguity
|
||||
# pygadm.Items supports 'admin' to fetch by GID
|
||||
kwargs = {'admin': [gadm_id]}
|
||||
|
||||
# content_level is not strictly needed if GID is specific,
|
||||
# but passing it doesn't hurt if we want to be explicit,
|
||||
# however 'admin' usually overrides or implies level.
|
||||
# Let's rely on GID as the primary key.
|
||||
|
||||
gdf = pygadm.Items(**kwargs)
|
||||
|
||||
if gdf is None or gdf.empty:
|
||||
print(json.dumps({"error": "Region not found"}))
|
||||
sys.exit(1)
|
||||
|
||||
# Convert to GeoJSON manually to avoid pandas errors
|
||||
features = []
|
||||
cols_props = [c for c in gdf.columns if c != 'geometry']
|
||||
|
||||
# Helper for safe string conversion
|
||||
def safe_str(v):
|
||||
return str(v) if v is not None else ""
|
||||
|
||||
for index, row in gdf.iterrows():
|
||||
properties = {c: safe_str(row[c]) for c in cols_props}
|
||||
try:
|
||||
geometry = row['geometry'].__geo_interface__ if row['geometry'] else None
|
||||
except:
|
||||
geometry = None
|
||||
|
||||
features.append({
|
||||
"type": "Feature",
|
||||
"properties": properties,
|
||||
"geometry": geometry
|
||||
})
|
||||
|
||||
output = {
|
||||
"type": "FeatureCollection",
|
||||
"features": features
|
||||
}
|
||||
|
||||
write_cache(path, output)
|
||||
print(json.dumps(output, default=str))
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({"error": str(e)}))
|
||||
sys.exit(1)
|
||||
|
||||
def get_names(admin, content_level=None, depth=1):
|
||||
path = get_cache_path(f"names_{admin}_{content_level}_{depth}", "names")
|
||||
cached = read_cache(path)
|
||||
if cached:
|
||||
print(json.dumps(cached, default=str))
|
||||
return
|
||||
|
||||
try:
|
||||
all_results = []
|
||||
|
||||
# Determine starting level
|
||||
# If content_level is explicit, start there.
|
||||
# Else derive from admin code: e.g. "ESP.6_1" (1 dot) -> Level 1. Next is 2.
|
||||
# "ESP" (0 dots) -> Level 0. Next is 1.
|
||||
if content_level:
|
||||
start_level = int(content_level)
|
||||
else:
|
||||
# GID format: XXX.L1.L2_1
|
||||
# Count dots:
|
||||
# ESP -> 0 -> start 1
|
||||
# ESP.1_1 -> 1 -> start 2
|
||||
start_level = admin.count('.') + 1
|
||||
|
||||
# Handle infinite depth (passed as -1 or similar string)
|
||||
if str(depth).lower() in ['infinite', 'inf', '-1']:
|
||||
limit = 5 # Reasonable cap for GADM (rarely goes beyond level 5)
|
||||
else:
|
||||
limit = int(depth)
|
||||
|
||||
current_level = start_level
|
||||
|
||||
for i in range(limit):
|
||||
# Fetch names for this level
|
||||
kwargs = {'admin': admin, 'content_level': current_level}
|
||||
|
||||
try:
|
||||
gdf = pygadm.Names(**kwargs)
|
||||
except Exception:
|
||||
# Likely level doesn't exist
|
||||
break
|
||||
|
||||
if gdf is None or gdf.empty:
|
||||
# No more levels
|
||||
break
|
||||
|
||||
# Convert to list of dicts
|
||||
cols = gdf.columns.tolist()
|
||||
def safe_str(v):
|
||||
return str(v) if v is not None else ""
|
||||
|
||||
for index, row in gdf.iterrows():
|
||||
r = {}
|
||||
for c in cols:
|
||||
r[c] = safe_str(row[c])
|
||||
all_results.append(r)
|
||||
|
||||
current_level += 1
|
||||
|
||||
output = {"data": all_results}
|
||||
write_cache(path, output)
|
||||
print(json.dumps(output, default=str))
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({"error": traceback.format_exc()}))
|
||||
sys.exit(1)
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='PyGADM Wrapper')
|
||||
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
|
||||
|
||||
# Search command
|
||||
search_parser = subparsers.add_parser('search', help='Search for regions')
|
||||
search_parser.add_argument('--query', required=True, help='Name to search for')
|
||||
search_parser.add_argument('--content_level', required=False, help='Content level filter')
|
||||
search_parser.add_argument('--geojson', action='store_true', help='Include GeoJSON geometry')
|
||||
search_parser.add_argument('--country', required=False, help='Country filter (Admin code or name)')
|
||||
|
||||
# Boundary command
|
||||
boundary_parser = subparsers.add_parser('boundary', help='Get region boundary')
|
||||
boundary_parser.add_argument('--id', required=True, help='GADM ID (for cache)')
|
||||
boundary_parser.add_argument('--name', required=True, help='Region Name')
|
||||
boundary_parser.add_argument('--content_level', required=False, help='Content Level')
|
||||
|
||||
# Names command
|
||||
names_parser = subparsers.add_parser('names', help='Get region names')
|
||||
names_parser.add_argument('--admin', required=True, help='Admin code (e.g. FRA)')
|
||||
names_parser.add_argument('--content_level', required=False, help='Content level')
|
||||
names_parser.add_argument('--depth', required=False, default=1, help='Depth of recursion (1=default, -1=infinite)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.command == 'search':
|
||||
search_regions(args.query, args.content_level, args.geojson, args.country)
|
||||
elif args.command == 'boundary':
|
||||
get_boundary(args.id, args.name, args.content_level)
|
||||
elif args.command == 'names':
|
||||
get_names(args.admin, args.content_level, args.depth)
|
||||
else:
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
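# For reference, a few example invocations of this wrapper (a sketch only; the query,
# GID and region name below are placeholders, and pygadm plus writable cache paths are assumed):
#
#   python gadm_wrapper.py search --query "Barcelona" --content_level 2 --geojson
#   python gadm_wrapper.py boundary --id "ESP.6_1" --name "Region Name"
#   python gadm_wrapper.py names --admin ESP --depth inf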
844
packages/shared/src/server/locations/index.ts
Normal file
@ -0,0 +1,844 @@
|
||||
import { getLocationsRoute, getLocationsStreamRoute, getCompetitorByIdRoute, getFindEmailStreamRoute, cancelJobRoute, getRegionSearchRoute, getRegionBoundaryRoute, getRegionNamesRoute, getRegionReverseRoute, getWikiSearchRoute, getLlmRegionInfoRoute, getEnrichStreamRoute } from './routes.js';
|
||||
import { run, IKBotTask } from '@polymech/kbot-d';
|
||||
import { EnricherRegistry } from './enrichers/registry.js';
|
||||
import { MetaEnricher } from './enrichers/meta.js';
|
||||
import { enrichmentService } from './enrichers/service.js';
|
||||
import { getLocationsByPlaceIds, upsertLocations } from './db.js';
|
||||
import { reverse } from '@polymech/search';
|
||||
import { LocationsWorker, LocationsJobData, EmailWorker, EmailJobData } from './pgboss.js';
|
||||
import { LOCATIONS_JOB_OPTIONS, COST_PER_SEARCH, JOB_NAME, EMAIL_JOB_NAME, EMAIL_JOB_OPTIONS, EMAIL_SEARCH_COST, EMAIL_STREAM_TIMEOUT_MS, EMAIL_MAX_BATCH_SIZE, TIMEOUT_MS_LOCATIONS } from './constants.js';
|
||||
import { logger } from '@/commons/logger.js';
|
||||
import { get_cached, params_hash } from './cache.js';
|
||||
import { streamSSE } from 'hono/streaming';
|
||||
import { EventBus } from '../EventBus.js';
|
||||
import { exec } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
import { AbstractProduct } from '../AbstractProduct.js';
|
||||
import { CONFIG_DEFAULT } from '@polymech/commons';
|
||||
|
||||
function deduplicateResults(results: any[] | null | undefined) {
|
||||
if (!Array.isArray(results)) return results;
|
||||
const seen = new Set();
|
||||
return results.filter(item => {
|
||||
const id = item.place_id || item.title; // Fallback to title if place_id missing
|
||||
if (!id) return true;
|
||||
if (seen.has(id)) return false;
|
||||
seen.add(id);
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
export class LocationsProduct extends AbstractProduct<LocationsJobData | EmailJobData> {
|
||||
id = 'locations';
|
||||
workers = [LocationsWorker, EmailWorker];
|
||||
jobOptions = LOCATIONS_JOB_OPTIONS;
|
||||
|
||||
actions = {
|
||||
search: {
|
||||
costUnits: COST_PER_SEARCH,
|
||||
cancellable: true,
|
||||
description: 'Search for locations',
|
||||
},
|
||||
email: {
|
||||
costUnits: EMAIL_SEARCH_COST,
|
||||
cancellable: true,
|
||||
description: 'Find emails for locations'
|
||||
}
|
||||
};
|
||||
|
||||
routes!: any[];
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.routes = [
|
||||
{ definition: getLocationsRoute, handler: this.handleGetLocations.bind(this) },
|
||||
{ definition: getLocationsStreamRoute, handler: this.handleStreamGet.bind(this) },
|
||||
{ definition: getCompetitorByIdRoute, handler: this.handleGetLocationById.bind(this) },
|
||||
{ definition: getFindEmailStreamRoute, handler: this.handleStreamEmail.bind(this) },
|
||||
{ definition: cancelJobRoute, handler: this.handleCancelJob.bind(this) },
|
||||
{ definition: getRegionSearchRoute, handler: this.handleGetRegionSearch.bind(this) },
|
||||
{ definition: getRegionBoundaryRoute, handler: this.handleGetRegionBoundary.bind(this) },
|
||||
{ definition: getRegionReverseRoute, handler: this.handleGetRegionReverse.bind(this) },
|
||||
{ definition: getRegionNamesRoute, handler: this.handleGetRegionNames.bind(this) },
|
||||
{ definition: getWikiSearchRoute, handler: this.handleGetWikiSearch.bind(this) },
|
||||
{ definition: getLlmRegionInfoRoute, handler: this.handleGetLlmRegionInfo.bind(this) },
|
||||
{ definition: getEnrichStreamRoute, handler: this.handleStreamEnrich.bind(this) }
|
||||
];
|
||||
|
||||
// Register default enrichers
|
||||
EnricherRegistry.register('meta', new MetaEnricher());
|
||||
}
|
||||
|
||||
async handleGetLlmRegionInfo(c: any) {
|
||||
const { location } = c.req.valid('query');
|
||||
try {
|
||||
const promptContent = `Provide a concise summary for ${location} strictly in valid JSON format with the following keys:
|
||||
- population: string (current estimate)
|
||||
- industry: string (key industries)
|
||||
- status: string (current geopolitical status, e.g. "Stable", "Conflict Zone", "Developing")
|
||||
- currency: string (currency name and code)
|
||||
- description: string (brief general description)
|
||||
|
||||
Do not include markdown formatting (like \`\`\`json). Just the raw JSON object.`;
|
||||
|
||||
const task: IKBotTask = {
    router: 'openai',
    model: 'gpt-4o', // safe default; overridden to the requested 'gpt-5' below
    prompt: promptContent,
    mode: 'completion',
    path: '.', // required by IKBotTask but unused for completions
};

// The request explicitly asks for router=openai / model=gpt-5. If kbot validates models
// strictly against an enum that lacks 'gpt-5', this override may fail; gpt-4o above is the fallback.
task.model = 'gpt-5';
|
||||
|
||||
const results = await run(task);
|
||||
const result = results.length > 0 ? results[0] : null;
|
||||
|
||||
// KBot completion mode usually returns string or the completion object
|
||||
let jsonStr = '';
|
||||
if (typeof result === 'string') {
|
||||
jsonStr = result;
|
||||
} else if (result && typeof result === 'object' && 'content' in result) {
|
||||
jsonStr = (result as any).content as string;
|
||||
} else {
|
||||
// Fallback or check structure
|
||||
jsonStr = JSON.stringify(result);
|
||||
}
|
||||
|
||||
// Cleanup jsonStr if it has markdown code blocks
|
||||
jsonStr = jsonStr.replace(/```json\n?|```/g, '').trim();
|
||||
|
||||
let data;
|
||||
try {
|
||||
data = JSON.parse(jsonStr);
|
||||
} catch (e) {
|
||||
logger.error({ jsonStr }, 'Failed to parse LLM response');
|
||||
return c.json({ error: 'Failed to parse LLM response' }, 500);
|
||||
}
|
||||
|
||||
return c.json({ data }, 200);
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(error, 'Error fetching LLM info');
|
||||
return c.json({ error: error.message || 'Internal Server Error' }, 500);
|
||||
}
|
||||
}
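For reference, the JSON shape that the prompt in handleGetLlmRegionInfo asks the model to produce can be captured in a small type on the consumer side. This is only an illustrative sketch, not part of this commit; it would live alongside the route schemas, and the handler itself returns the parsed object untyped as `data`.

// Shape requested by the prompt above (keys taken verbatim from the prompt).
export interface LlmRegionInfo {
  population: string;  // current estimate
  industry: string;    // key industries
  status: string;      // geopolitical status, e.g. "Stable", "Conflict Zone", "Developing"
  currency: string;    // currency name and code
  description: string; // brief general description
}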
|
||||
|
||||
async handleGetWikiSearch(c: any) {
|
||||
const { lat, lon, radius, limit } = c.req.valid('query');
|
||||
try {
|
||||
// Wikipedia API: https://en.wikipedia.org/w/api.php?action=query&list=geosearch&gscoord=37.7891838|-122.4033522&gsradius=10000&gslimit=10&format=json
|
||||
// We need a user agent: https://meta.wikimedia.org/wiki/User-Agent_policy
|
||||
|
||||
const apiUrl = `https://en.wikipedia.org/w/api.php`;
|
||||
const params = new URLSearchParams({
|
||||
action: 'query',
|
||||
list: 'geosearch',
|
||||
gscoord: `${lat}|${lon}`,
|
||||
gsradius: radius || '10000',
|
||||
gslimit: limit || '10',
|
||||
format: 'json'
|
||||
});
|
||||
|
||||
const headers = {
|
||||
'User-Agent': 'PolymechSaaS/1.0 (test@example.com)' // Replace with real contact if/when production
|
||||
};
|
||||
|
||||
const res = await fetch(`${apiUrl}?${params.toString()}`, { headers });
|
||||
|
||||
if (!res.ok) {
|
||||
return c.json({ error: 'Failed to fetch from Wikipedia' }, res.status);
|
||||
}
|
||||
|
||||
const json = await res.json();
|
||||
|
||||
if (json.error) {
|
||||
logger.error({ wikiError: json.error }, 'Wikipedia API Error');
|
||||
return c.json({ error: json.error.info || 'Wikipedia API Error' }, 500);
|
||||
}
|
||||
|
||||
const results = json.query?.geosearch || [];
|
||||
|
||||
return c.json({ data: results }, 200);
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(error, 'Error fetching Wiki data');
|
||||
return c.json({ error: error.message || 'Internal Server Error' }, 500);
|
||||
}
|
||||
}
|
||||
|
||||
async handleCancelJob(c: any) {
|
||||
logger.debug({ jobId: c.req.param('jobId') }, '[LocationsProduct] handleCancelJob called');
|
||||
const { jobId } = c.req.valid('param');
|
||||
try {
|
||||
if (!this.boss) {
|
||||
logger.error('[LocationsProduct] Job system not initialized');
|
||||
return c.json({ error: 'Job system not initialized' }, 503);
|
||||
}
|
||||
await (this.boss as any).cancel(EMAIL_JOB_NAME, jobId);
|
||||
logger.info({ jobId }, '[LocationsProduct] Job cancellation requested via API');
|
||||
return c.json({ message: 'Job cancellation requested' }, 200);
|
||||
} catch (e: any) {
|
||||
logger.error({ err: e, jobId }, '[LocationsProduct] Failed to cancel job');
|
||||
return c.json({ error: 'Failed to cancel job' }, 500);
|
||||
}
|
||||
}
|
||||
|
||||
async handleGetLocationById(c: any) {
|
||||
const { place_id } = c.req.valid('param');
|
||||
try {
|
||||
const results = await getLocationsByPlaceIds([place_id]);
|
||||
const location = results && results.length > 0 ? results[0] : null;
|
||||
|
||||
if (!location) {
|
||||
return c.json({ error: 'Competitor not found' }, 404);
|
||||
}
|
||||
|
||||
return c.json({
|
||||
message: 'Get competitor details success',
|
||||
data: location
|
||||
}, 200);
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(error, 'Error fetching competitor details');
|
||||
return c.json({ error: error.message || 'Internal Server Error' }, 500);
|
||||
}
|
||||
}
|
||||
|
||||
async handleGetRegionSearch(c: any) {
|
||||
const { query, content_level, geojson, country } = c.req.valid('query');
|
||||
try {
|
||||
const wrapperPath = path.join(__dirname, 'gadm_wrapper.py');
|
||||
// Assumes the python binary is available on PATH
|
||||
let cmd = `python "${wrapperPath}" search --query "${query}"`;
|
||||
if (content_level) {
|
||||
cmd += ` --content_level "${content_level}"`;
|
||||
}
|
||||
if (geojson === 'true') {
|
||||
cmd += ` --geojson`;
|
||||
}
|
||||
if (country) {
|
||||
cmd += ` --country "${country}"`;
|
||||
}
|
||||
const { stdout, stderr } = await execAsync(cmd, { maxBuffer: 1024 * 1024 * 50 }); // 50MB buffer
|
||||
|
||||
if (stderr) {
|
||||
console.error('[PyGADM] Stderr:', stderr);
|
||||
}
|
||||
|
||||
try {
|
||||
const json = JSON.parse(stdout);
|
||||
if (json.error) {
|
||||
return c.json({ error: json.error }, 500);
|
||||
}
|
||||
return c.json(json, 200);
|
||||
} catch (e) {
|
||||
logger.error({ stdout }, 'Failed to parse python output');
|
||||
return c.json({ error: 'Failed to parse GADM results' }, 500);
|
||||
}
|
||||
|
||||
} catch (error: any) {
|
||||
// Check if it's a python script error that outputted JSON
|
||||
if (error.stdout) {
|
||||
try {
|
||||
const json = JSON.parse(error.stdout);
|
||||
if (json.error) {
|
||||
logger.warn({ error: json.error }, 'GADM script returned error');
|
||||
return c.json({ error: json.error }, 400); // Bad request / Search failed
|
||||
}
|
||||
} catch (e) {
|
||||
// ignore parsing error, fall through to generic error
|
||||
}
|
||||
}
|
||||
logger.error(error, 'Error searching regions');
|
||||
return c.json({ error: error.message || 'Internal Server Error' }, 500);
|
||||
}
|
||||
}
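As a design note on the handler above: the command string interpolates user-supplied values. A sketch of the same invocation using execFile with an argument array (no shell) is shown below; names mirror the handler, and whether to adopt this hardening is a choice left open by this commit.

import { execFile } from 'child_process';
import { promisify } from 'util';

const execFileAsync = promisify(execFile);

// Sketch: invoke the GADM wrapper without going through a shell, so quotes and
// shell metacharacters in query/country cannot alter the command line.
async function runGadmSearch(wrapperPath: string, query: string, contentLevel?: string, geojson?: boolean, country?: string) {
  const args = [wrapperPath, 'search', '--query', query];
  if (contentLevel) args.push('--content_level', contentLevel);
  if (geojson) args.push('--geojson');
  if (country) args.push('--country', country);
  const { stdout } = await execFileAsync('python', args, { maxBuffer: 1024 * 1024 * 50 });
  return JSON.parse(stdout);
}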
|
||||
|
||||
async handleGetRegionBoundary(c: any) {
|
||||
const { id } = c.req.valid('param');
|
||||
const { name, level } = c.req.valid('query');
|
||||
|
||||
try {
|
||||
const wrapperPath = path.join(__dirname, 'gadm_wrapper.py');
|
||||
// The Python wrapper resolves the boundary by GADM ID (and caches it under that ID),
// so `name` and `level` are optional here and are simply passed through to the script.
|
||||
|
||||
const cmd = `python "${wrapperPath}" boundary --id "${id}" --name "${name || ''}" --content_level "${level || ''}"`;
|
||||
const { stdout, stderr } = await execAsync(cmd, { maxBuffer: 1024 * 1024 * 50 }); // 50MB buffer
|
||||
|
||||
if (stderr) {
|
||||
console.error('[PyGADM Boundary] Stderr:', stderr);
|
||||
}
|
||||
|
||||
try {
|
||||
const json = JSON.parse(stdout);
|
||||
if (json.error) {
|
||||
return c.json({ error: json.error }, 404);
|
||||
}
|
||||
return c.json(json, 200);
|
||||
} catch (e) {
|
||||
logger.error({ stdout }, 'Failed to parse python output');
|
||||
return c.json({ error: 'Failed to parse GADM boundary' }, 500);
|
||||
}
|
||||
|
||||
} catch (error: any) {
|
||||
// Check if it's a python script error that outputted JSON
|
||||
if (error.stdout) {
|
||||
try {
|
||||
const json = JSON.parse(error.stdout);
|
||||
if (json.error) {
|
||||
logger.warn({ error: json.error }, 'GADM boundary script returned error');
|
||||
return c.json({ error: json.error }, 404);
|
||||
}
|
||||
} catch (e) {
|
||||
// ignore parsing error
|
||||
}
|
||||
}
|
||||
logger.error(error, 'Error fetching boundary');
|
||||
return c.json({ error: error.message || 'Internal Server Error' }, 500);
|
||||
}
|
||||
}
|
||||
|
||||
async handleGetRegionNames(c: any) {
|
||||
const { admin, content_level, depth } = c.req.valid('query');
|
||||
try {
|
||||
const wrapperPath = path.join(__dirname, 'gadm_wrapper.py');
|
||||
let cmd = `python "${wrapperPath}" names --admin "${admin}"`;
|
||||
if (content_level) cmd += ` --content_level "${content_level}"`;
|
||||
if (depth) cmd += ` --depth "${depth}"`;
|
||||
|
||||
const { stdout, stderr } = await execAsync(cmd, { maxBuffer: 1024 * 1024 * 50 }); // 50MB buffer
|
||||
|
||||
if (stderr) {
|
||||
console.error('[PyGADM] Stderr:', stderr);
|
||||
}
|
||||
|
||||
try {
|
||||
const json = JSON.parse(stdout);
|
||||
if (json.error) {
|
||||
return c.json({ error: json.error }, 500);
|
||||
}
|
||||
return c.json(json, 200);
|
||||
} catch (e) {
|
||||
logger.error({ stdout }, 'Failed to parse python output');
|
||||
return c.json({ error: 'Failed to parse GADM names results' }, 500);
|
||||
}
|
||||
|
||||
} catch (error: any) {
|
||||
// Check if it's a python script error that outputted JSON
|
||||
if (error.stdout) {
|
||||
try {
|
||||
const json = JSON.parse(error.stdout);
|
||||
if (json.error) {
|
||||
logger.warn({ error: json.error }, 'GADM names script returned error');
|
||||
return c.json({ error: json.error }, 400);
|
||||
}
|
||||
} catch (e) {
|
||||
// ignore parsing error
|
||||
}
|
||||
}
|
||||
logger.error(error, 'Error fetching region names');
|
||||
return c.json({ error: error.message || 'Internal Server Error' }, 500);
|
||||
}
|
||||
}
|
||||
|
||||
async handleGetRegionReverse(c: any) {
|
||||
const { lat, lon } = c.req.valid('query');
|
||||
try {
|
||||
// Mock LocalResult structure expected by reverse()
|
||||
const loc: any = {
|
||||
gps_coordinates: {
|
||||
latitude: parseFloat(lat),
|
||||
longitude: parseFloat(lon)
|
||||
}
|
||||
};
|
||||
|
||||
// Use 'reverse' from the search package; it expects opts.bigdata.key,
// which is read from the default config below.
|
||||
|
||||
const config = CONFIG_DEFAULT() as any;
|
||||
await reverse(loc, { bigdata: { key: config.bigdata.key } });
|
||||
if (loc.geo) {
|
||||
return c.json({ data: loc.geo }, 200);
|
||||
} else {
|
||||
return c.json({ error: 'Reverse geocoding failed' }, 404);
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.error(error, 'Error reverse geocoding');
|
||||
return c.json({ error: error.message || 'Internal Server Error' }, 500);
|
||||
}
|
||||
}
|
||||
|
||||
hash(data: LocationsJobData) {
|
||||
const { userId, ...params } = data;
|
||||
return this.generateHash(params);
|
||||
}
|
||||
|
||||
meta(userId: string) {
|
||||
return {
|
||||
cost: COST_PER_SEARCH,
|
||||
userId,
|
||||
timestamp: new Date().toISOString()
|
||||
};
|
||||
}
|
||||
|
||||
async handleGetLocations(c: any) {
|
||||
const query = c.req.valid('query');
|
||||
const { refresh } = query;
|
||||
const forceRefresh = refresh === 'true';
|
||||
const userId = c.get('userId');
|
||||
const jobData = {
|
||||
query: query.query,
|
||||
location: query.location,
|
||||
filterCity: query.filterCity,
|
||||
filterContinent: query.filterContinent,
|
||||
filterCountry: query.filterCountry,
|
||||
filterType: query.filterType,
|
||||
limit: query.limit ? parseInt(query.limit) : 250,
|
||||
excludedTypes: query.excludedTypes,
|
||||
userId: userId
|
||||
};
|
||||
|
||||
try {
|
||||
const inputHash = params_hash({
|
||||
query: query.query,
|
||||
location: query.location,
|
||||
filterCity: query.filterCity,
|
||||
filterContinent: query.filterContinent,
|
||||
filterCountry: query.filterCountry,
|
||||
filterType: query.filterType,
|
||||
limit: query.limit ? parseInt(query.limit) : 250,
|
||||
excludedTypes: query.excludedTypes,
|
||||
zoom: query.zoom ? parseInt(query.zoom) : undefined
|
||||
});
|
||||
if (!forceRefresh) {
|
||||
const cached = await get_cached(inputHash);
|
||||
if (cached) {
|
||||
return c.json({
|
||||
message: 'Get locations success (cached)',
|
||||
data: deduplicateResults(cached)
|
||||
}, 200);
|
||||
}
|
||||
}
|
||||
const jobId = await this.sendJob(JOB_NAME, jobData, this.jobOptions);
|
||||
if (!jobId) {
|
||||
const cached = await get_cached(inputHash);
|
||||
if (cached) {
|
||||
return c.json({ message: 'Get locations success (cached)', data: deduplicateResults(cached) }, 200);
|
||||
}
|
||||
return c.json({ message: 'Job already submitted', jobId: null }, 202);
|
||||
}
|
||||
|
||||
try {
|
||||
await this.waitForJob(jobId, TIMEOUT_MS_LOCATIONS);
|
||||
const cached = await get_cached(inputHash);
|
||||
if (cached) {
|
||||
return c.json({ message: 'Get locations success (fresh)', data: deduplicateResults(cached) }, 200);
|
||||
}
|
||||
} catch (e) {
|
||||
if ((e as Error).message === 'Job timeout') {
|
||||
// Fallthrough to return 202
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(error, 'Error fetching locations');
|
||||
return c.json({ message: error.message || 'Internal Server Error' }, 500);
|
||||
}
|
||||
|
||||
return c.json({ message: 'Job submitted' }, 202);
|
||||
}
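The handler above returns 200 with data when the cache can be served (or the job finishes within the timeout) and 202 when the job is still running. A minimal caller sketch of that contract follows; the endpoint path and response fields come from routes.ts below, while the bearer-token header is an assumption about how authMiddleware is fed and may differ in practice.

// Sketch of a caller honouring the 200 (data ready) / 202 (job submitted) contract.
async function fetchLocations(baseUrl: string, token: string, query: Record<string, string>) {
  const url = `${baseUrl}/api/competitors?${new URLSearchParams(query)}`;
  const res = await fetch(url, { headers: { Authorization: `Bearer ${token}` } }); // auth header is an assumption
  if (res.status === 200) {
    const body = await res.json();
    return body.data; // cached or fresh results
  }
  if (res.status === 202) {
    return null; // job submitted; retry later or use the /stream endpoint instead
  }
  throw new Error(`Unexpected status ${res.status}`);
}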
|
||||
|
||||
async handleStreamGet(c: any) {
|
||||
const query = c.req.valid('query');
|
||||
const { refresh } = query;
|
||||
const forceRefresh = refresh === 'true';
|
||||
const userId = c.get('userId');
|
||||
|
||||
if (!userId) {
|
||||
return c.json({ message: 'Unauthorized' }, 401);
|
||||
}
|
||||
|
||||
return this.handleStream(c, {
|
||||
data: {
|
||||
query: query.query,
|
||||
location: query.location,
|
||||
filterCity: query.filterCity,
|
||||
filterContinent: query.filterContinent,
|
||||
filterCountry: query.filterCountry,
|
||||
filterType: query.filterType,
|
||||
limit: query.limit ? parseInt(query.limit) : 250,
|
||||
zoom: query.zoom ? parseInt(query.zoom) : undefined,
|
||||
excludedTypes: query.excludedTypes,
|
||||
},
|
||||
userId,
|
||||
forceRefresh,
|
||||
fetcher: async (data: any, uid: string) => {
|
||||
|
||||
const inputHash = this.generateHash(data);
|
||||
|
||||
const existing = await get_cached(inputHash);
|
||||
if (existing) return deduplicateResults(existing) as any[];
|
||||
|
||||
const jobData = { ...data, userId: uid };
|
||||
const options = { ...this.jobOptions, singletonKey: this.generateHash(data) };
|
||||
|
||||
logger.info(`Fetching locations for ${inputHash}`, jobData);
|
||||
const jobId = await this.sendJob(JOB_NAME, jobData, options);
|
||||
if (jobId) {
|
||||
await this.waitForJob(jobId, TIMEOUT_MS_LOCATIONS);
|
||||
} else {
|
||||
const checkAgain = await get_cached(inputHash);
|
||||
if (checkAgain) return deduplicateResults(checkAgain) as any[];
|
||||
|
||||
logger.info({ inputHash }, 'Waiting for debounced job...');
|
||||
await this.waitForHash(inputHash, TIMEOUT_MS_LOCATIONS);
|
||||
}
|
||||
|
||||
const result = await get_cached(inputHash);
|
||||
return deduplicateResults(result) || [];
|
||||
},
|
||||
cacheChecker: async (hash: string) => deduplicateResults(await get_cached(hash)) || null
|
||||
});
|
||||
}
|
||||
|
||||
async handleStreamEmail(c: any) {
|
||||
const { place_ids } = c.req.valid('query');
|
||||
let userId = c.get('userId') as string | undefined;
|
||||
|
||||
if (!userId) {
|
||||
return c.json({ error: 'Unauthorized' }, 401);
|
||||
}
|
||||
|
||||
const placeIdArray = place_ids.split(',').map((id: string) => id.trim()).filter((id: string) => id.length > 0);
|
||||
|
||||
if (placeIdArray.length === 0) {
|
||||
return c.json({ error: 'No place IDs provided' }, 400);
|
||||
}
|
||||
|
||||
if (placeIdArray.length > EMAIL_MAX_BATCH_SIZE) {
|
||||
return c.json({ error: `Maximum ${EMAIL_MAX_BATCH_SIZE} locations can be processed at once` }, 400);
|
||||
}
|
||||
|
||||
logger.info(`[LocationsProduct] Starting email search via worker for ${placeIdArray.length} locations`);
|
||||
|
||||
return streamSSE(c, async (stream) => {
|
||||
await stream.writeSSE({
|
||||
event: 'progress',
|
||||
data: JSON.stringify({
|
||||
current: 0,
|
||||
total: placeIdArray.length,
|
||||
percent: 0,
|
||||
message: `Queueing jobs for ${placeIdArray.length} location(s)`
|
||||
})
|
||||
});
|
||||
|
||||
const pendingJobIds = new Set<string>();
|
||||
const jobsMap = new Map<string, string>();
|
||||
let completedCount = 0;
|
||||
let successCount = 0;
|
||||
let failedCount = 0;
|
||||
let emailsFound = 0;
|
||||
|
||||
const onJobComplete = async (event: any) => {
|
||||
if (pendingJobIds.has(event.jobId)) {
|
||||
const place_id = jobsMap.get(event.jobId);
|
||||
pendingJobIds.delete(event.jobId);
|
||||
|
||||
const result = event.result;
|
||||
|
||||
await stream.writeSSE({
|
||||
event: 'location-start',
|
||||
data: JSON.stringify({
|
||||
place_id,
|
||||
title: 'Processing...',
|
||||
index: completedCount + 1,
|
||||
total: placeIdArray.length
|
||||
})
|
||||
});
|
||||
|
||||
if (result && result.emails) {
|
||||
successCount++;
|
||||
emailsFound += result.emails.length;
|
||||
|
||||
for (const emailMeta of result.emails) {
|
||||
await stream.writeSSE({
|
||||
event: 'location-email',
|
||||
data: JSON.stringify({
|
||||
place_id,
|
||||
email: emailMeta.email,
|
||||
source: emailMeta.source,
|
||||
cached: false
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
await stream.writeSSE({
|
||||
event: 'location-complete',
|
||||
data: JSON.stringify({
|
||||
place_id,
|
||||
emailCount: result?.emails?.length || 0,
|
||||
cached: false
|
||||
})
|
||||
});
|
||||
|
||||
completedCount++;
|
||||
|
||||
const percent = Math.round((completedCount / placeIdArray.length) * 100);
|
||||
await stream.writeSSE({
|
||||
event: 'progress',
|
||||
data: JSON.stringify({
|
||||
current: completedCount,
|
||||
total: placeIdArray.length,
|
||||
percent,
|
||||
message: `Processed ${completedCount} of ${placeIdArray.length}`
|
||||
})
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
const onJobFailed = async (event: any) => {
|
||||
if (pendingJobIds.has(event.jobId)) {
|
||||
const place_id = jobsMap.get(event.jobId);
|
||||
pendingJobIds.delete(event.jobId);
|
||||
|
||||
failedCount++;
|
||||
|
||||
await stream.writeSSE({
|
||||
event: 'location-error',
|
||||
data: JSON.stringify({
|
||||
place_id,
|
||||
error: event.error || 'Unknown error',
|
||||
canRetry: true
|
||||
})
|
||||
});
|
||||
|
||||
completedCount++;
|
||||
const percent = Math.round((completedCount / placeIdArray.length) * 100);
|
||||
await stream.writeSSE({
|
||||
event: 'progress',
|
||||
data: JSON.stringify({
|
||||
current: completedCount,
|
||||
total: placeIdArray.length,
|
||||
percent,
|
||||
message: `Processed ${completedCount} of ${placeIdArray.length}`
|
||||
})
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// Register Listeners BEFORE submitting jobs to capture fast completions
|
||||
EventBus.on('job:complete', onJobComplete);
|
||||
EventBus.on('job:failed', onJobFailed);
|
||||
|
||||
stream.onAbort(async () => {
|
||||
logger.info('[LocationsProduct] Stream aborted by client, cancelling pending jobs');
|
||||
for (const jobId of pendingJobIds) {
|
||||
try {
|
||||
if (this.boss) {
|
||||
await this.boss.cancel(EMAIL_JOB_NAME, jobId);
|
||||
}
|
||||
} catch (e) {
|
||||
logger.warn({ jobId, err: e }, 'Failed to cancel job on abort');
|
||||
}
|
||||
}
|
||||
clearTimeout(listTimeout);
|
||||
EventBus.off('job:complete', onJobComplete);
|
||||
EventBus.off('job:failed', onJobFailed);
|
||||
});
|
||||
|
||||
const listTimeout = setTimeout(() => {
|
||||
logger.warn('[LocationsProduct] Stream global timeout reached');
|
||||
EventBus.off('job:complete', onJobComplete);
|
||||
EventBus.off('job:failed', onJobFailed);
|
||||
stream.writeSSE({ event: 'error', data: JSON.stringify({ error: 'Global Timeout' }) });
|
||||
stream.close();
|
||||
}, EMAIL_STREAM_TIMEOUT_MS);
|
||||
|
||||
try {
|
||||
let submittedCount = 0;
|
||||
for (const place_id of placeIdArray) {
|
||||
const jobData = { place_id, userId };
|
||||
const options = { ...EMAIL_JOB_OPTIONS };
|
||||
|
||||
const jobId = await this.sendJob(EMAIL_JOB_NAME, jobData, options);
|
||||
|
||||
if (jobId) {
|
||||
pendingJobIds.add(jobId);
|
||||
jobsMap.set(jobId, place_id);
|
||||
|
||||
// Emit job info immediately for cancellation support
|
||||
await stream.writeSSE({
|
||||
event: 'location-job',
|
||||
data: JSON.stringify({ place_id, jobId })
|
||||
});
|
||||
}
|
||||
submittedCount++;
|
||||
}
|
||||
|
||||
// Wait loop
|
||||
while (pendingJobIds.size > 0) {
|
||||
// logger.info(`[LocationsProduct] ${pendingJobIds.size} pending jobs`);
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
}
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error({ err: error }, 'Error in email stream loop');
|
||||
// Attempt to send error event
|
||||
try {
|
||||
await stream.writeSSE({
|
||||
event: 'error',
|
||||
data: JSON.stringify({ error: error.message || 'Stream processing error' })
|
||||
});
|
||||
} catch (e) { /* ignore stream write errors if closed */ }
|
||||
} finally {
|
||||
clearTimeout(listTimeout);
|
||||
EventBus.off('job:complete', onJobComplete);
|
||||
EventBus.off('job:failed', onJobFailed);
|
||||
}
|
||||
|
||||
await stream.writeSSE({
|
||||
event: 'complete',
|
||||
data: JSON.stringify({
|
||||
total: placeIdArray.length,
|
||||
successful: successCount,
|
||||
failed: failedCount,
|
||||
emailsFound
|
||||
})
|
||||
});
|
||||
});
|
||||
}
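The email stream above emits progress, location-job, location-start, location-email, location-complete, location-error and a final complete event. A browser-side sketch of consuming it is shown below; the URL and token query parameter follow getFindEmailStreamRoute in routes.ts, and passing the auth token that way is the route's stated mechanism for SSE.

// Sketch: consume the email-finding SSE stream from a browser client.
function watchEmailStream(baseUrl: string, placeIds: string[], token: string) {
  const params = new URLSearchParams({ place_ids: placeIds.join(','), token });
  const source = new EventSource(`${baseUrl}/api/competitors/email/stream?${params}`);

  source.addEventListener('progress', (e) => {
    const p = JSON.parse((e as MessageEvent).data);
    console.log(`progress ${p.current}/${p.total} (${p.percent ?? 0}%)`);
  });
  source.addEventListener('location-email', (e) => {
    const d = JSON.parse((e as MessageEvent).data);
    console.log(`email for ${d.place_id}: ${d.email} (source: ${d.source})`);
  });
  source.addEventListener('location-error', (e) => {
    console.warn('location failed', JSON.parse((e as MessageEvent).data));
  });
  source.addEventListener('complete', (e) => {
    console.log('done', JSON.parse((e as MessageEvent).data));
    source.close();
  });
  source.addEventListener('error', () => source.close());
  return source;
}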
|
||||
|
||||
async handleStreamEnrich(c: any) {
|
||||
const { place_ids, enrichers } = c.req.valid('query');
|
||||
const userId = c.get('userId');
|
||||
|
||||
if (!userId) {
|
||||
return c.json({ error: 'Unauthorized' }, 401);
|
||||
}
|
||||
|
||||
const placeIdArray = place_ids.split(',').map((id: string) => id.trim()).filter((id: string) => id.length > 0);
|
||||
const enricherNames = enrichers.split(',').map((e: string) => e.trim()).filter((e: string) => e.length > 0);
|
||||
|
||||
if (placeIdArray.length === 0) {
|
||||
return c.json({ error: 'No place IDs provided' }, 400);
|
||||
}
|
||||
|
||||
const requestedEnrichers = enricherNames.map((name: string) => EnricherRegistry.get(name)).filter((e: any) => e !== undefined);
|
||||
|
||||
if (requestedEnrichers.length === 0) {
|
||||
return c.json({ error: 'No valid enrichers requested' }, 400);
|
||||
}
|
||||
|
||||
logger.info({ count: placeIdArray.length, enrichers: enricherNames }, '[LocationsProduct] Starting enrichment stream');
|
||||
|
||||
return streamSSE(c, async (stream) => {
|
||||
await stream.writeSSE({
|
||||
event: 'progress',
|
||||
data: JSON.stringify({
|
||||
current: 0,
|
||||
total: placeIdArray.length,
|
||||
message: `Starting enrichment for ${placeIdArray.length} items`
|
||||
})
|
||||
});
|
||||
|
||||
// Fetch the current rows from the DB so enrichers have a base object to work from.
// Places missing from the DB are skipped below; accepting partial objects from the client
// is deliberately avoided.
|
||||
const existingLocations = await getLocationsByPlaceIds(placeIdArray);
|
||||
const locationMap = new Map(existingLocations.map(l => [l.place_id, l]));
|
||||
|
||||
let processed = 0;
|
||||
|
||||
for (const placeId of placeIdArray) {
|
||||
const location = locationMap.get(placeId);
|
||||
if (!location) {
|
||||
logger.warn({ placeId }, 'Location not found for enrichment');
|
||||
// Emit error or skip
|
||||
continue;
|
||||
}
|
||||
|
||||
// Run each enricher
|
||||
let updates: any = {};
|
||||
for (const enricher of requestedEnrichers) {
|
||||
try {
|
||||
const result = await enricher!.enrich(location, { userId, logger });
|
||||
updates = { ...updates, ...result };
|
||||
|
||||
// Update our local copy for subsequent enrichers if needed
|
||||
// Object.assign(location, result);
|
||||
// Note: Deep merge might be safer, but for now top-level partials are expected.
|
||||
} catch (e: any) {
|
||||
logger.error({ placeId, enricher: enricher!.name, err: e }, 'Enricher failed');
|
||||
}
|
||||
}
|
||||
|
||||
if (Object.keys(updates).length > 0) {
|
||||
// Persist to DB
|
||||
// We need to merge updates into the location.
|
||||
// upsertLocations expects arrays.
|
||||
const updatedLocation = {
|
||||
...location,
|
||||
...updates,
|
||||
// Top-level fields are replaced; raw_data is shallow-merged below.
|
||||
raw_data: {
|
||||
...(location.raw_data || {}),
|
||||
...(updates.raw_data || {})
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
await upsertLocations([updatedLocation]);
|
||||
|
||||
await stream.writeSSE({
|
||||
event: 'enrichment-update',
|
||||
data: JSON.stringify({
|
||||
place_id: placeId,
|
||||
updates: updates
|
||||
})
|
||||
});
|
||||
} catch (e: any) {
|
||||
logger.error({ placeId, err: e }, 'Failed to persist enrichment updates');
|
||||
}
|
||||
}
|
||||
|
||||
processed++;
|
||||
await stream.writeSSE({
|
||||
event: 'progress',
|
||||
data: JSON.stringify({
|
||||
current: processed,
|
||||
total: placeIdArray.length,
|
||||
message: `Enriched ${processed} of ${placeIdArray.length}`
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
await stream.writeSSE({ event: 'complete', data: JSON.stringify({ processed }) });
|
||||
});
|
||||
}
|
||||
}
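The enrichment stream runs whichever enrichers are registered under the names the client passes. The Enricher interface itself lives in ./enrichers/registry.js and is not shown in this commit, so the sketch below infers its shape from how handleStreamEnrich uses it (`enricher.name` and `await enricher.enrich(location, { userId, logger })`); treat that shape and the example enricher as assumptions.

// Sketch of a custom enricher; interface shape inferred from handleStreamEnrich.
const websiteStatusEnricher = {
  name: 'website-status',
  async enrich(location: any, ctx: { userId: string; logger: any }) {
    if (!location.website) return {};
    try {
      const res = await fetch(location.website, { method: 'HEAD' });
      // Returned partials are shallow-merged into the location and into raw_data.
      return { raw_data: { website_status: res.status } };
    } catch (e) {
      ctx.logger.warn({ place_id: location.place_id, err: e }, 'website check failed');
      return {};
    }
  }
};

// Registered under the name clients pass via the `enrichers` query parameter.
EnricherRegistry.register('website-status', websiteStatusEnricher);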
|
||||
172
packages/shared/src/server/locations/jobs-find.ts
Normal file
@ -0,0 +1,172 @@
|
||||
import { googleMaps, ResolveFlags } from '@polymech/search';
|
||||
import { upsertLocations, storeSearch, getLocationsMeta } from './db.js';
|
||||
import { supabase } from '../../commons/supabase.js';
|
||||
import { params_hash } from './cache.js';
|
||||
import { logger } from './logger.js';
|
||||
|
||||
export interface SearchParams {
|
||||
query: string;
|
||||
location: string;
|
||||
filterCity?: string;
|
||||
filterContinent?: string;
|
||||
filterCountry?: string;
|
||||
filterType?: string;
|
||||
concurrency?: number;
|
||||
limit?: number;
|
||||
zoom?: number;
|
||||
excludedTypes?: string[];
|
||||
}
|
||||
|
||||
export const searchLocations = async (params: SearchParams, userId: string) => {
|
||||
logger.info({ params }, 'Fetching locations');
|
||||
|
||||
const { query, location, filterCity, filterContinent, filterCountry, filterType, limit, zoom, excludedTypes } = params;
|
||||
|
||||
// 1. Fetch from SerpApi
|
||||
const results = await googleMaps({
|
||||
query: query,
|
||||
searchFrom: location,
|
||||
resolve: [ResolveFlags.PHOTOS],
|
||||
filterCity,
|
||||
filterContinent,
|
||||
filterCountry,
|
||||
filterType,
|
||||
limit,
|
||||
zoom,
|
||||
excludedTypes
|
||||
});
|
||||
|
||||
// Flatten results
|
||||
const flatResults = results ? results.flat(Infinity) : [];
|
||||
|
||||
// 2. Map Results
|
||||
const locationsToUpsert = flatResults
|
||||
.filter((r: any) => r.place_id)
|
||||
.map((r: any) => ({
|
||||
place_id: r.place_id,
|
||||
title: r.title,
|
||||
description: r.description,
|
||||
address: r.address,
|
||||
gps_coordinates: r.gps_coordinates,
|
||||
phone: r.phone,
|
||||
website: r.website,
|
||||
operating_hours: r.operating_hours,
|
||||
thumbnail: r.thumbnail,
|
||||
types: r.types,
|
||||
raw_data: r,
|
||||
continent: r.geo?.continent,
|
||||
country: r.geo?.countryName,
|
||||
city: r.geo?.city,
|
||||
updated_at: new Date().toISOString(),
|
||||
user_id: userId
|
||||
}));
|
||||
|
||||
// 2b. Update Known Types (Side Effect)
|
||||
if (userId) {
|
||||
try {
|
||||
const { data: profile } = await supabase
|
||||
.from('profiles')
|
||||
.select('settings')
|
||||
.eq('id', userId)
|
||||
.maybeSingle();
|
||||
|
||||
const settings = profile?.settings || {};
|
||||
const knownTypes: string[] = Array.isArray(settings.known_types) ? settings.known_types : [];
|
||||
const newTypesSet = new Set(knownTypes);
|
||||
|
||||
locationsToUpsert.forEach((l: any) => {
|
||||
if (l.types && Array.isArray(l.types)) {
|
||||
l.types.forEach((t: string) => newTypesSet.add(t));
|
||||
}
|
||||
});
|
||||
|
||||
if (newTypesSet.size > knownTypes.length) {
|
||||
const updatedKnownTypes = Array.from(newTypesSet).sort();
|
||||
await supabase
|
||||
.from('profiles')
|
||||
.update({
|
||||
settings: { ...settings, known_types: updatedKnownTypes }
|
||||
})
|
||||
.eq('id', userId);
|
||||
}
|
||||
} catch (err) {
|
||||
logger.error(err, 'Error processing known types');
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Deduplicate Locations (avoids the Postgres 21000 "ON CONFLICT cannot affect row a second time" error)
|
||||
const uniqueLocationsMap = new Map();
|
||||
locationsToUpsert.forEach((l: any) => {
|
||||
const pid = String(l.place_id).trim();
|
||||
// Ensure place_id is cleaned
|
||||
l.place_id = pid;
|
||||
|
||||
if (!uniqueLocationsMap.has(pid)) {
|
||||
uniqueLocationsMap.set(pid, l);
|
||||
} else {
|
||||
logger.debug({ place_id: pid }, 'Duplicate place_id found in batch, skipping');
|
||||
}
|
||||
});
|
||||
const uniqueLocations = Array.from(uniqueLocationsMap.values());
|
||||
logger.info({
|
||||
total: locationsToUpsert.length,
|
||||
unique: uniqueLocations.length,
|
||||
diff: locationsToUpsert.length - uniqueLocations.length
|
||||
}, 'Deduplication stats');
|
||||
|
||||
// 4. Preserve Meta (Emails)
|
||||
const placeIds = uniqueLocations.map((l: any) => l.place_id);
|
||||
if (placeIds.length > 0) {
|
||||
const existingLocations = await getLocationsMeta(placeIds);
|
||||
if (existingLocations) {
|
||||
const metaMap = new Map(existingLocations.map((l: any) => [l.place_id, l.meta]));
|
||||
uniqueLocations.forEach((l: any) => {
|
||||
const existingMeta = metaMap.get(l.place_id);
|
||||
if (existingMeta) {
|
||||
l.raw_data.meta = {
|
||||
...(l.raw_data.meta || {}),
|
||||
...existingMeta
|
||||
};
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Upsert Locations & Store Search
|
||||
try {
|
||||
// Attempt single batch upsert for performance
|
||||
logger.info({ count: uniqueLocations.length }, 'Attempting batch upsert');
|
||||
await upsertLocations(uniqueLocations);
|
||||
logger.info({ count: uniqueLocations.length }, 'Batch upsert successful');
|
||||
} catch (err: any) {
|
||||
// Fallback to serial processing if duplicates exist within batch (Postgres 21000)
|
||||
// or any other batch-level error that might be resolvable row-by-row
|
||||
logger.warn({ err }, 'Batch upsert failed, falling back to sequential upserts');
|
||||
|
||||
// Execute upserts sequentially to safely handle "ON CONFLICT ... row a second time"
|
||||
// This effectively implements "Last Write Wins" for duplicates in the batch
|
||||
for (const loc of uniqueLocations) {
|
||||
try {
|
||||
await upsertLocations([loc]);
|
||||
} catch (innerErr) {
|
||||
logger.error({ place_id: loc.place_id, err: innerErr }, 'Failed to upsert individual location');
|
||||
// Continue processing others
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await storeSearch(params_hash({
|
||||
query,
|
||||
location,
|
||||
filterCity,
|
||||
filterContinent,
|
||||
filterCountry,
|
||||
filterType,
|
||||
limit,
|
||||
zoom,
|
||||
excludedTypes
|
||||
}), { query, location }, placeIds);
|
||||
|
||||
return uniqueLocations;
|
||||
};
|
||||
2
packages/shared/src/server/locations/logger.ts
Normal file
@ -0,0 +1,2 @@
|
||||
import { logger as rootLogger } from '@/commons/logger.js';
|
||||
export const logger = rootLogger.child({ product: 'locations' });
|
||||
0
packages/shared/src/server/locations/meta.ts
Normal file
167
packages/shared/src/server/locations/pgboss.ts
Normal file
@ -0,0 +1,167 @@
|
||||
import { Job, PgBoss } from 'pg-boss';
|
||||
import { AbstractWorker } from '@/jobs/boss/AbstractWorker.js';
|
||||
import { Worker } from '@/commons/decorators.js';
|
||||
import { searchLocations, SearchParams } from './jobs-find.js';
|
||||
import { JOB_NAME, EMAIL_JOB_NAME, COST_PER_SEARCH, EMAIL_SEARCH_COST, EMAIL_META_TIMEOUT_MS, EMAIL_SEARCH_TIMEOUT_MS, EMAIL_SEARCH_MAX_PAGES, EMAIL_SEARCH_PAGE_TIMEOUT_MS, EMAIL_SEARCH_PAGE_CONCURRENCY, EMAIL_MAX_CONCURRENT_JOBS } from './constants.js';
|
||||
import { params_hash } from './cache.js';
|
||||
import { findEmailEach, parseHtml, Page, Meta } from '@polymech/search';
|
||||
import { getLocationByPlaceId } from '../../endpoints/competitors/getLocationByPlaceId.js';
|
||||
import { supabase } from '../../commons/supabase.js';
|
||||
import { logger } from '@/commons/logger.js';
|
||||
|
||||
export interface LocationsJobData extends SearchParams {
|
||||
userId: string;
|
||||
}
|
||||
|
||||
export interface EmailJobData {
|
||||
place_id: string;
|
||||
userId: string;
|
||||
}
|
||||
|
||||
@Worker(JOB_NAME)
|
||||
export class LocationsWorker extends AbstractWorker<LocationsJobData> {
|
||||
readonly queueName = JOB_NAME;
|
||||
|
||||
calculateCost(job: Job<LocationsJobData>, result: any): number {
|
||||
return COST_PER_SEARCH + (result?.length || 0) * 0.1;
|
||||
}
|
||||
protected async process(job: Job<LocationsJobData>) {
|
||||
const { userId, ...params } = job.data;
|
||||
const results = await searchLocations(params, userId);
|
||||
return { count: results.length, placeIds: results.map((r: any) => r.place_id) };
|
||||
}
|
||||
}
|
||||
|
||||
@Worker(EMAIL_JOB_NAME)
|
||||
export class EmailWorker extends AbstractWorker<EmailJobData> {
|
||||
readonly queueName = EMAIL_JOB_NAME;
|
||||
readonly teamSize = EMAIL_MAX_CONCURRENT_JOBS;
|
||||
|
||||
calculateCost(job: Job<EmailJobData>, result: any): number {
|
||||
return EMAIL_SEARCH_COST;
|
||||
}
|
||||
|
||||
protected async process(job: Job<EmailJobData>) {
|
||||
logger.info(`[EmailWorker] Processing job ${job.id} for place_id: ${job.data.place_id}`);
|
||||
|
||||
const { place_id, userId } = job.data;
|
||||
|
||||
// 1. Fetch location details
|
||||
const location = await getLocationByPlaceId(place_id);
|
||||
if (!location) {
|
||||
throw new Error('Location not found');
|
||||
}
|
||||
|
||||
// 2. A fresh search is always performed when this job runs; cache / refresh decisions
// are left to the caller. A website is required for scraping.
|
||||
if (!location.website) {
|
||||
throw new Error('No website available');
|
||||
}
|
||||
|
||||
// 3. Ensure Meta Data
|
||||
if (!location.meta || !location.meta.pages) {
|
||||
logger.info(`[EmailWorker] Meta missing for ${place_id} : ${location.title}, fetching...`);
|
||||
const meta = await parseHtml(location.website, null, {
|
||||
headless: true,
|
||||
timeout: EMAIL_META_TIMEOUT_MS
|
||||
});
|
||||
if (meta) {
|
||||
location.meta = { ...location.meta, ...meta } as any;
|
||||
// Update DB with meta
|
||||
await supabase.from('locations').update({ meta: location.meta }).eq('place_id', place_id);
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Execute Email Scraping
|
||||
logger.info(`[EmailWorker] Starting email search for ${place_id} : ${location.title}`);
|
||||
|
||||
let isCancelled = false;
|
||||
const checkCancelled = async () => {
|
||||
if (isCancelled) return true;
|
||||
if (!this.boss) return false;
|
||||
try {
|
||||
const currentJob = await this.boss.getJobById(job.name, job.id);
|
||||
const dbCancelled = !!(currentJob && (currentJob as any).state === 'cancelled');
|
||||
if (dbCancelled) isCancelled = true;
|
||||
return isCancelled;
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
const searchPromise = findEmailEach(location, {
|
||||
headless: process.env.EMAIL_SEARCH_HEADLESS === 'false' ? false : true,
|
||||
searchFrom: 'api',
|
||||
abortAfter: 1,
|
||||
maxPages: EMAIL_SEARCH_MAX_PAGES,
|
||||
pageTimeout: EMAIL_SEARCH_PAGE_TIMEOUT_MS,
|
||||
concurrency: EMAIL_SEARCH_PAGE_CONCURRENCY,
|
||||
checkCancelled
|
||||
}, async (page: Page) => {
|
||||
logger.info(`[EmailWorker] Found email for ${place_id}`);
|
||||
if (await checkCancelled()) {
|
||||
logger.info(`[EmailWorker] Job cancelled for ${place_id}`);
|
||||
throw new Error('Cancelled');
|
||||
}
|
||||
});
|
||||
|
||||
// Wrap the search in a timeout; on timeout, signal cancellation and continue with any partial results
|
||||
let emails: string[] = [];
|
||||
logger.info(`[EmailWorker] Starting search with timeout: ${EMAIL_SEARCH_TIMEOUT_MS}ms`);
|
||||
try {
|
||||
const result = await Promise.race([
|
||||
searchPromise,
|
||||
new Promise((_, reject) => setTimeout(() => {
|
||||
logger.warn(`[EmailWorker] Timeout reached for ${place_id}, signalling cancellation...`);
|
||||
isCancelled = true;
|
||||
reject(new Error('Timeout'));
|
||||
}, EMAIL_SEARCH_TIMEOUT_MS))
|
||||
]);
|
||||
if (result) emails = result as string[];
|
||||
} catch (e: any) {
|
||||
if (e.message === 'Timeout') {
|
||||
logger.warn(`[EmailWorker] Timeout searching emails for ${place_id}`);
|
||||
// Proceed with what we have (empty)
|
||||
} else if (e.message === 'Cancelled' || e.message === 'CancelledByUser') {
|
||||
logger.warn(`[EmailWorker] Job cancelled for ${place_id}`);
|
||||
// Proceed with what we have (likely empty or partial)
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
// Format emails
|
||||
const emailsWithMetadata = emails.map((email) => ({
|
||||
email,
|
||||
source: location.website || '',
|
||||
foundAt: new Date().toISOString(),
|
||||
tool: 'puppeteer'
|
||||
}));
|
||||
|
||||
// Update location with emails
|
||||
if (emailsWithMetadata.length > 0) {
|
||||
logger.info(`[EmailWorker] Updating location ${place_id} with ${emailsWithMetadata.length} emails`);
|
||||
await supabase
|
||||
.from('locations')
|
||||
.update({
|
||||
meta: {
|
||||
...location.meta,
|
||||
emails: emailsWithMetadata // Replaces emails with fresh finding
|
||||
},
|
||||
updated_at: new Date().toISOString()
|
||||
})
|
||||
.eq('place_id', place_id);
|
||||
} else {
|
||||
logger.info(`[EmailWorker] Search finished for ${place_id}. No emails found.`);
|
||||
}
|
||||
|
||||
return {
|
||||
place_id,
|
||||
emailsCount: emailsWithMetadata.length,
|
||||
emails: emailsWithMetadata
|
||||
};
|
||||
}
|
||||
}
|
||||
375
packages/shared/src/server/locations/routes.ts
Normal file
@ -0,0 +1,375 @@
|
||||
import { createRoute } from '@hono/zod-openapi';
|
||||
import { LocationRequestSchema, LocationResponseSchema, LocationDetailResponseSchema } from './schemas.js';
|
||||
import { Public } from '@/commons/decorators.js';
|
||||
import { z } from 'zod';
|
||||
|
||||
import { authMiddleware, optionalAuthMiddleware } from '../../middleware/auth.js';
|
||||
|
||||
const tags = ['Locations'];
|
||||
|
||||
export const RequestSchema = z.object({
|
||||
query: z.string().default(''),
|
||||
location: z.string().optional().default('barcelona, spain'),
|
||||
refresh: z.string().optional().default('false'),
|
||||
filterCountry: z.string().optional(),
|
||||
filterCity: z.string().optional(),
|
||||
filterContinent: z.string().optional(),
|
||||
filterType: z.string().optional(),
|
||||
limit: z.string().optional().default('250'),
|
||||
zoom: z.string().optional().describe('Map zoom level'),
|
||||
excludedTypes: z.union([z.string(), z.array(z.string())]).optional().transform((val) => {
|
||||
if (!val) return undefined;
|
||||
if (typeof val === 'string') return [val];
|
||||
return val;
|
||||
}),
|
||||
}).loose()
|
||||
|
||||
export type CompetitorRequest = z.infer<typeof RequestSchema>;
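The excludedTypes transform above accepts either a single string or an array and always yields an array after parsing. A quick illustrative usage of the schema (values are placeholders, not part of this commit):

// Both forms normalize to an array after parsing; other fields fall back to their defaults.
const single = RequestSchema.parse({ query: 'cafes', excludedTypes: 'atm' });
// single.excludedTypes -> ['atm']

const multiple = RequestSchema.parse({ query: 'cafes', excludedTypes: ['atm', 'bank'] });
// multiple.excludedTypes -> ['atm', 'bank']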
|
||||
|
||||
export const cancelJobRoute = createRoute({
|
||||
method: 'post',
|
||||
path: '/api/competitors/email/job/{jobId}/cancel',
|
||||
tags,
|
||||
middleware: [authMiddleware] as const,
|
||||
request: {
|
||||
params: z.object({
|
||||
jobId: z.string(),
|
||||
}),
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: z.object({
|
||||
message: z.string(),
|
||||
}),
|
||||
},
|
||||
},
|
||||
description: 'Job cancelled successfully',
|
||||
},
|
||||
404: {
|
||||
description: 'Job not found',
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
export const getLocationsRoute = createRoute({
|
||||
method: 'get',
|
||||
path: '/api/competitors',
|
||||
tags,
|
||||
middleware: [authMiddleware] as const,
|
||||
request: {
|
||||
query: RequestSchema,
|
||||
},
|
||||
responses: {
|
||||
202: {
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: z.object({
|
||||
message: z.string(),
|
||||
jobId: z.string().nullable(),
|
||||
}),
|
||||
},
|
||||
},
|
||||
description: 'Job submitted',
|
||||
},
|
||||
200: {
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: LocationResponseSchema,
|
||||
},
|
||||
},
|
||||
description: 'Retrieve locations (cached)',
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
export const getLocationsStreamRoute = createRoute({
|
||||
method: 'get',
|
||||
path: '/api/competitors/stream',
|
||||
tags,
|
||||
middleware: [authMiddleware] as const,
|
||||
request: {
|
||||
query: LocationRequestSchema,
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Server-Sent Events stream of location results',
|
||||
content: {
|
||||
'text/event-stream': {
|
||||
schema: {
|
||||
type: 'string'
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
export const getCompetitorByIdRoute = Public(createRoute({
|
||||
method: 'get',
|
||||
path: '/api/competitors/{place_id}',
|
||||
tags,
|
||||
request: {
|
||||
params: z.object({
|
||||
place_id: z.string(),
|
||||
}),
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: LocationDetailResponseSchema,
|
||||
},
|
||||
},
|
||||
description: 'Retrieve competitor details',
|
||||
},
|
||||
404: {
|
||||
description: 'Competitor not found',
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
}));
|
||||
|
||||
export const getLlmRegionInfoRoute = Public(createRoute({
|
||||
method: 'get',
|
||||
path: '/api/locations/llm-info',
|
||||
tags,
|
||||
request: {
|
||||
query: z.object({
|
||||
location: z.string().describe('Location name to get info for'),
|
||||
}),
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
description: 'LLM Region Info',
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: z.object({
|
||||
data: z.any(),
|
||||
}),
|
||||
},
|
||||
},
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
}));
|
||||
|
||||
export const getFindEmailStreamRoute = createRoute({
|
||||
method: 'get',
|
||||
path: '/api/competitors/email/stream',
|
||||
tags,
|
||||
middleware: [authMiddleware] as const,
|
||||
request: {
|
||||
query: z.object({
|
||||
place_ids: z.string().describe('Comma-separated list of place IDs'),
|
||||
token: z.string().optional().describe('Auth token for SSE'),
|
||||
}),
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Server-Sent Events stream of email finding results',
|
||||
content: {
|
||||
'text/event-stream': {
|
||||
schema: {
|
||||
type: 'string'
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
export const getRegionSearchRoute = Public(createRoute({
|
||||
method: 'get',
|
||||
path: '/api/regions/search',
|
||||
tags,
|
||||
request: {
|
||||
query: z.object({
|
||||
query: z.string().describe('Region name to search for'),
|
||||
content_level: z.string().optional().describe('Filter by content level (e.g. 1)'),
|
||||
geojson: z.string().optional().describe('Include GeoJSON geometry (true/false)'),
country: z.string().optional().describe('Country filter (admin code or name)'),
|
||||
}),
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Search results',
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: z.object({
|
||||
data: z.array(z.any()),
|
||||
}),
|
||||
},
|
||||
},
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
}));
|
||||
|
||||
export const getRegionBoundaryRoute = Public(createRoute({
|
||||
method: 'get',
|
||||
path: '/api/regions/boundary/{id}',
|
||||
tags,
|
||||
request: {
|
||||
params: z.object({
|
||||
id: z.string().describe('GADM ID of the region'),
|
||||
}),
|
||||
query: z.object({
|
||||
name: z.string().optional().describe('Region Name (Required for GADM fetch)'),
|
||||
level: z.string().optional().describe('Content Level'),
|
||||
}),
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
description: 'GeoJSON boundary',
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: z.object({
|
||||
data: z.any(), // GeoJSON object
|
||||
}),
|
||||
},
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Region not found',
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
}));
|
||||
|
||||
export const getRegionReverseRoute = Public(createRoute({
|
||||
method: 'get',
|
||||
path: '/api/regions/reverse',
|
||||
tags,
|
||||
request: {
|
||||
query: z.object({
|
||||
lat: z.string().describe('Latitude'),
|
||||
lon: z.string().describe('Longitude'),
|
||||
}),
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: z.object({
|
||||
data: z.any().describe('Reverse geocoding result')
|
||||
}),
|
||||
},
|
||||
},
|
||||
description: 'Reverse geocoding successful',
|
||||
},
|
||||
500: {
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: z.object({
|
||||
error: z.string(),
|
||||
}),
|
||||
},
|
||||
},
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
}));
|
||||
|
||||
export const getRegionNamesRoute = Public(createRoute({
|
||||
method: 'get',
|
||||
path: '/api/regions/names',
|
||||
tags,
|
||||
request: {
|
||||
query: z.object({
|
||||
admin: z.string().describe('Admin code (e.g. FRA, FRA.1_1)'),
|
||||
content_level: z.string().describe('Content level (e.g. 2)').default('2'),
depth: z.string().optional().describe('Depth of recursion (1 = default, -1 or "inf" = infinite)'),
|
||||
}),
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Region names results',
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: z.object({
|
||||
data: z.array(z.any()),
|
||||
}),
|
||||
},
|
||||
},
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
}));
|
||||
export const getWikiSearchRoute = Public(createRoute({
|
||||
method: 'get',
|
||||
path: '/api/locations/wiki',
|
||||
tags,
|
||||
request: {
|
||||
query: z.object({
|
||||
lat: z.string().describe('Latitude'),
|
||||
lon: z.string().describe('Longitude'),
|
||||
radius: z.string().optional().default('10000').describe('Radius in meters (max 10000)'), // WP limit is usually 10000
|
||||
limit: z.string().optional().default('10').describe('Limit results'),
|
||||
}),
|
||||
},
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Wiki geosearch results',
|
||||
content: {
|
||||
'application/json': {
|
||||
schema: z.object({
|
||||
data: z.array(z.any()),
|
||||
}),
|
||||
},
|
||||
},
|
||||
},
|
||||
500: {
|
||||
description: 'Server error',
|
||||
},
|
||||
},
|
||||
}));

export const getEnrichStreamRoute = createRoute({
  method: 'get',
  path: '/api/competitors/enrich/stream',
  tags,
  middleware: [authMiddleware] as const,
  request: {
    query: z.object({
      place_ids: z.string().describe('Comma-separated list of place IDs'),
      enrichers: z.string().describe('Comma-separated list of enrichers (e.g. meta)'),
    }),
  },
  responses: {
    200: {
      description: 'Server-Sent Events stream of enrichment results',
      content: {
        'text/event-stream': {
          schema: {
            type: 'string',
          },
        },
      },
    },
    500: {
      description: 'Server error',
    },
  },
});
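
A browser-side consumption sketch. The event names (`result`, `complete`, `error`) are assumed to follow the convention of the locations stream handler added later in this commit, and cookie-based auth is assumed so `EventSource` can satisfy `authMiddleware`; adjust if the enrich handler differs:

const params = new URLSearchParams({
  place_ids: 'ChIJabc123,ChIJdef456', // hypothetical place IDs
  enrichers: 'meta',
});
const source = new EventSource(`/api/competitors/enrich/stream?${params}`, { withCredentials: true });

source.addEventListener('result', (e) => {
  const enriched = JSON.parse((e as MessageEvent).data);
  console.log('enriched place', enriched);
});
source.addEventListener('complete', () => source.close());
source.addEventListener('error', () => source.close());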
28
packages/shared/src/server/locations/schemas.ts
Normal file
@ -0,0 +1,28 @@
import { z } from '@hono/zod-openapi';
import { CompetitorResponseSchema, CompetitorDetailResponseSchema } from '@polymech/shared';

// Define Request Schema locally for server/OpenAPI compatibility
export const LocationRequestSchema = z.object({
  query: z.string().default('plastichub'),
  location: z.string().optional().default('barcelona, spain'),
  refresh: z.string().optional().default('false'),
  filterCountry: z.string().optional(),
  filterCity: z.string().optional(),
  filterContinent: z.string().optional(),
  filterType: z.string().optional(),
  concurrency: z.string().optional().default('5'),
  limit: z.string().optional().default('250'),
  zoom: z.string().optional().describe('Map zoom level'),
  excludedTypes: z.union([z.string(), z.array(z.string())]).optional().transform((val) => {
    if (!val) return undefined;
    if (typeof val === 'string') return [val];
    return val;
  }),
});

export const LocationResponseSchema = CompetitorResponseSchema;
export const LocationDetailResponseSchema = CompetitorDetailResponseSchema;

export type LocationRequest = z.infer<typeof LocationRequestSchema>;
export type LocationResponse = z.infer<typeof LocationResponseSchema>;
export type LocationDetailResponse = z.infer<typeof LocationDetailResponseSchema>;
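
A quick illustration of the `excludedTypes` normalization and the string defaults above; purely for reference, not part of the commit:

const single = LocationRequestSchema.parse({ query: 'cafes', excludedTypes: 'lodging' });
console.log(single.excludedTypes); // ['lodging'] - a lone string becomes a one-element array

const multiple = LocationRequestSchema.parse({ query: 'cafes', excludedTypes: ['lodging', 'gas_station'] });
console.log(multiple.excludedTypes); // ['lodging', 'gas_station'] - arrays pass through unchanged

const omitted = LocationRequestSchema.parse({});
console.log(omitted.query, omitted.limit, omitted.excludedTypes); // 'plastichub' '250' undefined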
107
packages/shared/src/server/locations/stream.ts
Normal file
@ -0,0 +1,107 @@
import { createRoute, RouteHandler } from '@hono/zod-openapi';
import { streamSSE } from 'hono/streaming';
import { LocationRequestSchema } from './schemas.js';
import { searchLocations } from './jobs-find.js';
import { get_cached, params_hash } from './cache.js';
import { HonoEnv } from '@/commons/types.js';
import { logger } from './logger.js';

const tags = ['Locations'];

export const getLocationsStreamRoute = createRoute({
  method: 'get',
  path: '/api/locations/stream',
  tags,
  request: {
    query: LocationRequestSchema,
  },
  responses: {
    200: {
      description: 'Server-Sent Events stream of location results',
      content: {
        'text/event-stream': {
          schema: {
            type: 'string',
          },
        },
      },
    },
    500: {
      description: 'Server error',
    },
  },
});

export const stream_get: RouteHandler<typeof getLocationsStreamRoute, HonoEnv> = async (c) => {
|
||||
const query = c.req.valid('query');
|
||||
const { refresh } = query;
|
||||
const forceRefresh = refresh === 'true';
|
||||
const userId = c.get('userId');
|
||||
|
||||
if (!userId) {
|
||||
logger.error('Unauthorized : no userId');
|
||||
return c.json({ message: 'Unauthorized' }, 401);
|
||||
}
|
||||
|
||||
const inputHash = params_hash({ query: query.query, location: query.location });
|
||||
|
||||
return streamSSE(c, async (stream) => {
|
||||
try {
|
||||
await stream.writeSSE({
|
||||
event: 'progress',
|
||||
data: JSON.stringify({ stage: 'starting', percent: 0 })
|
||||
});
|
||||
|
||||
if (!forceRefresh) {
|
||||
await stream.writeSSE({
|
||||
event: 'progress',
|
||||
data: JSON.stringify({ stage: 'checking_cache', percent: 10 })
|
||||
});
|
||||
|
||||
const cached = await get_cached(inputHash);
|
||||
if (cached) {
|
||||
for (let i = 0; i < cached.length; i++) {
|
||||
await stream.writeSSE({
|
||||
event: 'result',
|
||||
data: JSON.stringify(cached[i])
|
||||
});
|
||||
}
|
||||
await stream.writeSSE({
|
||||
event: 'complete',
|
||||
data: JSON.stringify({ total: cached.length, cached: true })
|
||||
});
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
await stream.writeSSE({
|
||||
event: 'progress',
|
||||
data: JSON.stringify({ stage: 'fetching_from_api', percent: 20 })
|
||||
});
|
||||
|
||||
// Note: searchLocations currently awaits all results.
|
||||
// Ideally, we would refactor searchLocations to yield results or accept a callback.
|
||||
// For now, we fetch all then stream.
|
||||
const results = await searchLocations(query as any, userId);
|
||||
|
||||
for (let i = 0; i < results.length; i++) {
|
||||
await stream.writeSSE({
|
||||
event: 'result',
|
||||
data: JSON.stringify(results[i])
|
||||
});
|
||||
}
|
||||
|
||||
await stream.writeSSE({
|
||||
event: 'complete',
|
||||
data: JSON.stringify({ total: results.length, cached: false })
|
||||
});
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(error, 'Stream error');
|
||||
await stream.writeSSE({
|
||||
event: 'error',
|
||||
data: JSON.stringify({ error: error.message || 'Internal Server Error' })
|
||||
});
|
||||
}
|
||||
});
|
||||
};
|
||||
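
For reference, direct registration of this route looks like the sketch below; in this commit the same wiring is assumed to be done centrally by `registerProductRoutes()` in registry.ts for every { definition, handler } pair a product exposes:

import { OpenAPIHono } from '@hono/zod-openapi';
import { HonoEnv } from '@/commons/types.js';
import { getLocationsStreamRoute, stream_get } from './stream.js';

// Direct, one-off registration of the stream route with its handler.
const app = new OpenAPIHono<HonoEnv>();
app.openapi(getLocationsStreamRoute, stream_get);

export default app;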
34
packages/shared/src/server/registry.ts
Normal file
@ -0,0 +1,34 @@
import { PgBoss } from 'pg-boss';
import { AbstractProduct } from './AbstractProduct.js';
import { LocationsProduct } from './locations/index.js';
import './subscriber.js';

export const ALL_PRODUCTS: AbstractProduct[] = [
  new LocationsProduct()
];

// Helper to get all workers
export const getAllWorkers = () => {
  return ALL_PRODUCTS.flatMap(p => p.workers || []);
};

// Helper to register routes
export const registerProductRoutes = (app: any) => {
  ALL_PRODUCTS.forEach(product => {
    product.routes.forEach(route => {
      // @ts-ignore - Hono types might mismatch slightly
      app.openapi(route.definition, route.handler);
    });
  });
};

// Helper to initialize products (lifecycle: start)
export const startProducts = async (boss: PgBoss) => {
  for (const product of ALL_PRODUCTS) {
    await product.start(boss);
  }
};
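
A rough bootstrap sketch showing the assumed order of operations (mount routes, start pg-boss, start products, which registers their workers); the `DATABASE_URL` handling is an assumption, not part of this commit:

import { PgBoss } from 'pg-boss';
import { OpenAPIHono } from '@hono/zod-openapi';
import { registerProductRoutes, startProducts } from './registry.js';

export const bootstrap = async (app: OpenAPIHono<any>) => {
  // Mount every product's routes on the app first.
  registerProductRoutes(app);

  // Then bring up pg-boss and hand it to the products so they can register workers.
  const boss = new PgBoss(process.env.DATABASE_URL!); // assumes a configured connection string
  await boss.start();
  await startProducts(boss);

  return boss;
};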
42
packages/shared/src/server/subscriber.ts
Normal file
@ -0,0 +1,42 @@
import { ALL_PRODUCTS } from './registry.js';
import { EventBus } from './EventBus.js';

const findProductByQueue = (queue: string) => {
  return ALL_PRODUCTS.find(p =>
    p.workers?.some(w => {
      try {
        const worker = new (w as any)();
        return worker.queueName === queue;
      } catch (e) {
        return false;
      }
    })
  );
};

EventBus.on('job:create', (event: any) => {
  const product = findProductByQueue(event.queue);

  if (!product) return;

  // Apply default job options from product if available
  if (product.jobOptions) {
    event.options = { ...product.jobOptions, ...event.options };
  }

  const singletonKey = product.hash(event.data);
  if (singletonKey) {
    event.options.singletonKey = singletonKey;
    // Default to 5 minutes if not specified
    if (!event.options.singletonSeconds) {
      event.options.singletonSeconds = 300;
    }
  }

  const { userId } = event.data;
  if (userId) {
    const metadata = product.meta(userId);
    event.data = { ...event.data, ...metadata };
  }
});
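
An illustrative emit against this subscriber; the queue name is hypothetical, and how the mutated event is ultimately handed to pg-boss is outside this excerpt:

import { EventBus } from './EventBus.js';

// The subscriber above will merge the owning product's jobOptions into options,
// derive a singletonKey via product.hash(), and attach product.meta(userId) to data.
EventBus.emit('job:create', {
  queue: 'locations-find', // hypothetical queue name
  data: { query: 'plastichub', userId: 'user-123' },
  options: {},
});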