kbot iterator notes

This commit is contained in:
lovebird 2025-04-07 12:39:52 +02:00
parent 243d8c2fad
commit 07fe877eee
10 changed files with 425 additions and 24 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,5 +1,11 @@
import { IKBotTask } from '@polymech/ai-tools';
import { AsyncTransformer, ErrorCallback, FilterCallback } from './async-iterator.js';
/**
* Notes for LLM modifications
*
* - this is a wrapper around the async-iterator.ts file, implementing application layer caching and other features
* - to test it, use `npm run examples:iterator-factory`
*/
export interface ILogger {
info: (message: string) => void;
warn: (message: string) => void;
@ -42,8 +48,6 @@ export declare function transformWithMappings(obj: Record<string, any>, createTr
filterCallback?: FilterCallback;
maxRetries?: number;
retryDelay?: number;
logger?: {
error: (message: string, error?: any) => void;
};
logger?: ILogger;
cacheConfig?: CacheConfig;
}): Promise<void>;

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,376 @@
# Iterator Implementation Review
## Potential Bugs and Edge Cases
### Type Safety Issues
1. Excessive use of `any` type in key functions:
- `removeEmptyObjects` function uses `any` return type and parameter (line 19)
- Limited type checking in cache key generation and object cloning
### Error Handling
1. Inconsistent error handling:
- In `createLLMTransformer`, errors are caught but only logged (line 106); there is no retry mechanism outside of `transformPath`
- Retry mechanism in `transformPath` uses exponential backoff but lacks circuit breaking capability
2. API errors not properly categorized:
- No distinction between transient errors (like rate limits) and permanent errors (like invalid requests)
- Missing status code handling from LLM API responses
- No handling of network timeouts for long-running LLM requests
### Cache Implementation
1. Cache key generation issues:
- The cache key built by `createObjectCacheKey` (line 137) is produced by calling `JSON.stringify` on full data objects, which may:
- Create extremely large cache keys
- Fail with circular references
- Generate different keys for identical logical objects if properties are in different orders
2. Cache expiration:
- Fixed default expiration time (7 days) might not be suitable for all use cases
- No mechanism to force refresh or invalidate specific cache entries
3. Cache isolation:
- No isolation between different versions of models (newer models might give better results)
- No context-based cache namespacing (different applications using same cache)
### Concurrency and Performance
1. Fixed throttling implementation:
- `throttleDelay` is applied globally without considering API rate limits
- Default concurrency of 1 may be overly cautious for some APIs
- No adaptability to different LLM providers' rate limit policies
2. JSON parsing overhead:
- Deep cloning via `JSON.parse(JSON.stringify())` in multiple places (lines 189, 208) can cause:
- Performance issues with large objects
- Loss or corruption of values that don't round-trip through JSON (e.g., Date objects become strings; functions and `undefined` properties are dropped)
- Memory spikes during transformation
3. Inefficient parallel execution:
- The iterator processes field mappings sequentially rather than in parallel batches
- No priority system for more important transformations
### Data Integrity
1. Deep merge implementation risks:
- The custom `deepMerge` function (line 144) doesn't properly handle arrays
- No protection against prototype pollution
- May overwrite existing values unexpectedly
2. JSONPath implementation limitations:
- No validation of JSONPath syntax
- No handling for missing paths
- Potential for duplicate updates when JSONPath matches multiple nodes
### Integration Issues
1. LLM integration rigidity:
- Tight coupling to specific LLM API structure in `createLLMTransformer`
- Limited flexibility for different output formats (assumes string response)
- No streaming support for larger transformations
2. Missing validation for prompt templates:
- No checking if prompts exceed token limits
- Prompts are concatenated with input without token awareness
- No handling of LLM context windows
## Suggested Improvements
### Type Safety
1. Replace uses of `any` with proper type definitions:
```typescript
export const removeEmptyObjects = <T>(obj: T): T => {
// Implementation with proper type checking
}
```
2. Define stricter interfaces for cache keys and values:
```typescript
interface CacheKey {
prompt: string;
model?: string;
router?: string;
mode?: string;
}
```
### Error Handling
1. Implement consistent error handling strategy:
```typescript
// Add proper error classes
/**
 * Error describing a failed transformation of a single field.
 *
 * The message has the form `Error transforming <path>: <cause message>`,
 * and the offending path, the untransformed value and the underlying error
 * are kept as public properties for callers that want to inspect them.
 */
export class TransformError extends Error {
  /** Path of the field whose transformation failed. */
  public path: string;
  /** Value of the field before the transformation was attempted. */
  public originalValue: string;
  /** Underlying error that triggered this failure. */
  public cause: Error;

  constructor(path: string, originalValue: string, cause: Error) {
    super(`Error transforming ${path}: ${cause.message}`);
    this.path = path;
    this.originalValue = originalValue;
    this.cause = cause;
    // Set after the fields so the own-property order matches the message-first layout.
    this.name = 'TransformError';
  }
}
```
2. Add circuit breaker pattern for API calls:
```typescript
// In createLLMTransformer
const circuitBreaker = new CircuitBreaker({
failureThreshold: 3,
resetTimeout: 30000
});
return async (input: string, jsonPath: string): Promise<string> => {
return circuitBreaker.fire(() => callLLMAPI(input, jsonPath));
};
```
3. Categorize and handle API errors appropriately:
```typescript
/**
 * Runs an LLM task and routes a failure according to its `status` field.
 *
 * - `status === 429`: the call is retried through `retryWithExponentialBackoff`.
 * - numeric `status` in `[400, 500)`: a `ClientError` carrying the failure
 *   message is thrown; the request is not retried.
 * - anything else (including failures with no numeric `status`): the call is
 *   retried through `retryWithLinearBackoff`.
 *
 * @param task - Task passed to `run`.
 * @param input - Accepted for signature compatibility; not read by this function.
 * @returns The result of `run(task)` or of the retry helper that was used.
 * @throws ClientError when the failure carries a non-429 4xx status.
 */
async function handleLLMRequest(task: IKBotTask, input: string): Promise<string> {
  try {
    return await run(task);
  } catch (error: unknown) {
    // A catch variable can be anything (string, undefined, plain object), so
    // read `status` / `message` defensively instead of dereferencing blindly.
    const details: { status?: unknown; message?: unknown } =
      typeof error === 'object' && error !== null ? error : {};
    const status = details.status;
    const message = typeof details.message === 'string' ? details.message : String(error);

    if (status === 429) {
      // Rate limit - back off and retry
      return await retryWithExponentialBackoff(() => run(task));
    }
    if (typeof status === 'number' && status >= 400 && status < 500) {
      // Client error - fix request or abort
      throw new ClientError(message);
    }
    // Server error or unclassified failure - retry with caution
    return await retryWithLinearBackoff(() => run(task));
  }
}
```
### Cache Implementation
1. Improve cache key generation:
```typescript
/**
 * Builds a deterministic, fixed-length cache key for one LLM transformation.
 *
 * The key is the hex SHA-256 digest of the task's `prompt`, its `model` and
 * the complete `input`. The whole input is hashed on purpose: the digest is
 * always 64 hex characters regardless of input size, whereas truncating the
 * input would make different inputs that share a prefix collide on one key
 * and receive each other's cached response.
 *
 * @param task - Task whose `prompt` and `model` take part in the key.
 * @param input - Text being transformed; hashed in full.
 * @returns 64-character lowercase hex digest.
 */
const createCacheKey = (task: IKBotTask, input: string): string => {
  // Object literal with a fixed property order, so JSON.stringify output is
  // stable for logically identical (prompt, model, input) triples.
  const keyObj = {
    prompt: task.prompt,
    model: task.model,
    input,
  };
  return createHash('sha256').update(JSON.stringify(keyObj)).digest('hex');
};
```
2. Add cache control capabilities:
```typescript
export interface CacheConfig {
enabled?: boolean;
namespace?: string;
expiration?: number;
forceRefresh?: boolean;
keyGenerator?: (task: IKBotTask, input: string) => string;
versionStrategy?: 'model-based' | 'time-based' | 'none';
}
```
3. Implement context-aware cache namespacing:
```typescript
/**
 * Derives a cache namespace of the form `<namespace>-<appId>-<model>`.
 *
 * Falsy parts fall back to defaults: `llm-responses` for the namespace,
 * `default` for the application id and `unknown-model` for the model.
 * Every non-word character in the model name is replaced with `-`.
 *
 * @param config - Cache configuration supplying the base `namespace`.
 * @param options - Task options supplying `appId` and `model`.
 * @returns The composed namespace string.
 */
function createContextualNamespace(config: CacheConfig, options: IKBotTask): string {
  const baseNamespace = config.namespace ? config.namespace : 'llm-responses';
  const applicationId = options.appId ? options.appId : 'default';

  // An absent or empty model name sanitizes to nothing and takes the fallback.
  const sanitizedModel = options.model ? options.model.replace(/[^\w]/g, '-') : '';
  const modelSegment = sanitizedModel ? sanitizedModel : 'unknown-model';

  return [baseNamespace, applicationId, modelSegment].join('-');
}
```
### Concurrency and Performance
1. Replace deep cloning with structured cloning or immutable data libraries:
```typescript
// structuredClone is a global function (Node.js 17+ and modern browsers);
// it is not exported from 'node:util', so no import is needed.
// Unlike a JSON round-trip it keeps Date, Map, Set and circular references,
// but it throws a DataCloneError if the value contains functions.
// Replace JSON.parse(JSON.stringify(obj)) with:
const transformedObj = structuredClone(obj);
```
2. Add adaptive throttling based on API responses:
```typescript
const adaptiveThrottle = createAdaptiveThrottle({
initialLimit: 10,
initialInterval: 1000,
maxLimit: 50,
adjustOnError: (err) => {
// Check rate limit errors and adjust accordingly
}
});
```
3. Implement parallel batch processing:
```typescript
// Process mappings in parallel batches
/**
 * Applies every mapping to `obj`, processing the mappings in consecutive
 * batches. All mappings of one batch are started together via `Promise.all`,
 * and the next batch starts only after the current one has settled
 * successfully; a rejection in any mapping rejects the whole call.
 *
 * @param obj - Object handed to `processMapping` for each mapping.
 * @param mappings - Mappings to process, in order.
 * @param batchSize - Maximum number of mappings per batch (default 3).
 *   Fractional values are rounded down.
 * @throws RangeError when `batchSize` is NaN or smaller than 1; such a value
 *   would otherwise loop forever (0 or negative) or skip all work (NaN).
 */
async function transformInBatches(obj: Record<string, any>, mappings: FieldMapping[], batchSize: number = 3) {
  // `!(x >= 1)` also rejects NaN, which a plain `x < 1` check would let through.
  if (!(batchSize >= 1)) {
    throw new RangeError(`batchSize must be a number >= 1, got ${batchSize}`);
  }
  const size = Math.floor(batchSize);

  for (let start = 0; start < mappings.length; start += size) {
    const batch = mappings.slice(start, start + size);
    await Promise.all(batch.map(mapping => processMapping(obj, mapping)));
  }
}
```
### Interface Improvements
1. Simplify the API for common use cases:
```typescript
// Simple transform helper
export async function transform<T>(
data: T,
mapping: FieldMapping | FieldMapping[],
options?: Partial<IKBotTask>
): Promise<T> {
const mappings = Array.isArray(mapping) ? mapping : [mapping];
const result = structuredClone(data);
await createIterator(result, options || {}).transform(mappings);
return result;
}
```
2. Add type-safe JSONPath:
```typescript
// Type-safe JSONPath function
export function createTypeSafePath<T, R>(
path: string,
validator: (value: unknown) => value is R
): JSONPathSelector<T, R> {
// Implementation
}
```
3. Support streaming transformations:
```typescript
export interface StreamOptions extends IOptions {
onProgress?: (current: number, total: number) => void;
onFieldTransform?: (path: string, before: string, after: string) => void;
}
export function createStreamingIterator(
obj: Record<string, any>,
optionsMixin: Partial<IKBotTask>,
streamOptions: StreamOptions
): IteratorFactory {
// Implementation with callbacks for progress updates
}
```
## Alternative Libraries
### Lightweight Alternatives
1. **JSONata** instead of JSONPath
- More expressive query language
- Smaller footprint (54KB vs 120KB)
- Built-in transformation capabilities
- Example conversion:
```typescript
// Instead of JSONPath:
const paths = JSONPath({ path: '$.products.fruits[*].description', json: obj });
// With JSONata:
const result = jsonata('products.fruits.description').evaluate(obj);
```
2. **p-limit** instead of p-throttle and p-map
- Simpler API
- More focused functionality
- Smaller bundle size
- Example conversion:
```typescript
// Instead of:
const throttle = pThrottle({
limit: 1,
interval: throttleDelay,
});
await pMap(items, async (item) => throttle(transform)(item));
// With p-limit:
const limit = pLimit(concurrentTasks);
await Promise.all(items.map(item =>
limit(() => new Promise(r => setTimeout(() => r(transform(item)), throttleDelay)))
));
```
3. **fast-copy** instead of JSON.parse/stringify
- 2-3x faster than JSON method
- Handles circular references
- Preserves prototypes
- Example conversion:
```typescript
// Instead of:
const copy = JSON.parse(JSON.stringify(obj));
// With fast-copy:
import copy from 'fast-copy';
const objCopy = copy(obj);
```
4. **object-path** instead of custom path traversal
- Well-tested library for object access by path
- Simpler error handling
- Better performance
- Example conversion:
```typescript
// Instead of custom path traversal:
let current = obj;
for (const key of keys) {
if (current[key] === undefined) return;
current = current[key];
}
// With object-path:
import objectPath from 'object-path';
const value = objectPath.get(obj, path);
objectPath.set(obj, path, newValue);
```
5. **oazapfts** or **openapi-typescript** for LLM API clients
- Type-safe API clients generated from OpenAPI specs
- Consistent error handling
- Proper request/response typing
- Example:
```typescript
import { createClient } from './generated/openai-client';
const client = createClient({
apiKey: process.env.OPENAI_API_KEY,
});
const response = await client.createChatCompletion({
model: 'gpt-4',
messages: [{ role: 'user', content: prompt }]
});
```
## Enhanced Interface Suggestions
```typescript
// Strongly typed transform function
export async function transform<T extends Record<string, any>>(
data: T,
options: {
paths: {
source: string;
target?: string;
prompt: string;
}[];
model?: string;
router?: string;
cache?: boolean | Partial<CacheConfig>;
concurrency?: number;
logger?: Partial<ILogger>;
}
): Promise<T>;
// Simplified usage example:
const result = await transform(myData, {
paths: [
{
source: '$.description',
prompt: 'Make this more engaging'
},
{
source: '$.title',
target: 'seoTitle',
prompt: 'Create an SEO-optimized version'
}
],
model: 'gpt-4',
concurrency: 5
});
```

View File

@ -3,7 +3,7 @@
"messages": [
{
"role": "user",
"content": "Generate a more appealing marketing name for this product\n\nText to transform: \"banana\""
"content": "Generate a more appealing marketing name for this product\n\nText to transform: \"apple2\""
},
{
"role": "user",

View File

@ -1,6 +1,7 @@
import { JSONPath } from 'jsonpath-plus'
import pThrottle from 'p-throttle'
import pMap from 'p-map'
import { deepClone } from "@polymech/core/objects"
export type AsyncTransformer = (input: string, path: string) => Promise<string>
export type ErrorCallback = (path: string, value: string, error: unknown) => void
@ -201,7 +202,7 @@ export async function transformObjectWithOptions(
}
// For targetPath case, create a deep clone and transform it
const dataCopy = JSON.parse(JSON.stringify(obj));
const dataCopy = deepClone(obj)
// Transform the copy
await transformObject(

View File

@ -6,6 +6,13 @@ import { E_OPENROUTER_MODEL } from '../../models/cache/openrouter-models.js';
import { E_Mode } from '../../zod_schema.js';
import { FieldMapping, createIterator, createLLMTransformer, CacheConfig } from '../../iterator.js';
/**
* Notes for LLM modifications
*
* - to test it, use `npm run examples:iterator-factory`
*/
const MODEL = E_OPENROUTER_MODEL.MODEL_OPENROUTER_QUASAR_ALPHA;
const ROUTER = 'openrouter';
const LOG_LEVEL = 2;
@ -24,12 +31,12 @@ const exampleData = {
fruits: [
{
id: 'f1',
name: 'apple',
name: 'apple2',
description: 'A sweet and crunchy fruit',
details: {
color: 'red',
origin: 'Worldwide',
nutrition: 'Rich in fiber and vitamin C'
nutrition: 'Rich in fiber and vitamin D'
}
},
{

View File

@ -4,6 +4,13 @@ import { run } from './commands/run.js'
import { get_cached_object, set_cached_object, rm_cached_object } from "@polymech/cache"
import { deepClone } from "@polymech/core/objects"
/**
* Notes for LLM modifications
*
* - this is a wrapper around the async-iterator.ts file, implementing application layer caching and other features
* - to test it, use `npm run examples:iterator-factory`
*/
export interface ILogger {
info: (message: string) => void;
warn: (message: string) => void;
@ -247,14 +254,14 @@ export async function transformWithMappings(
createTransformer: (options: IKBotTask) => AsyncTransformer,
mappings: FieldMapping[],
globalOptions: {
throttleDelay?: number
concurrentTasks?: number
errorCallback?: ErrorCallback
filterCallback?: FilterCallback
maxRetries?: number
retryDelay?: number
logger?: { error: (message: string, error?: any) => void }
cacheConfig?: CacheConfig
throttleDelay?: number;
concurrentTasks?: number;
errorCallback?: ErrorCallback;
filterCallback?: FilterCallback;
maxRetries?: number;
retryDelay?: number;
logger?: ILogger;
cacheConfig?: CacheConfig;
} = {}
): Promise<void> {
const iterator = createIterator(obj, {}, globalOptions);

View File

@ -3,14 +3,14 @@
"fruits": [
{
"id": "f1",
"name": "apple",
"name": "apple2",
"description": "A deliciously sweet fruit with a satisfying crunch, bursting with juicy flavor and vibrant freshness in every bite, perfect for a refreshing and healthy snack.",
"details": {
"color": "red",
"origin": "Worldwide",
"nutrition": "Rich in fiber and vitamin C, this food supports digestive health by promoting regularity, boosts immune function, and provides antioxidant protection to help reduce inflammation and support overall wellness."
"nutrition": "Rich in fiber and vitamin D, this promotes digestive health, supports gut microbiome balance, and helps maintain strong bones and immune function through enhanced calcium absorption and regulation of immune responses."
},
"marketingName": "Crimson Orchard Delight"
"marketingName": "Apple Fusion"
},
{
"id": "f2",