You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

240 lines
6.2 KiB

import { DataSource } from '@prisma/client';
import { AiService } from '../ai.service';
/** Fallback iteration count when `AI_LIVE_BENCHMARK_ITERATIONS` is unset or invalid. */
const DEFAULT_BENCHMARK_ITERATIONS = 3;

/** Fallback failure budget when `AI_LIVE_BENCHMARK_MAX_FAILURES` is unset or invalid. */
const DEFAULT_ALLOWED_FAILURES = 1;

/** p95 latency target (milliseconds) for the single-tool scenario: 5 seconds. */
const LIVE_SINGLE_TOOL_TARGET_IN_MS = 5 * 1_000;

/** p95 latency target (milliseconds) for the multi-step scenario: 15 seconds. */
const LIVE_MULTI_STEP_TARGET_IN_MS = 15 * 1_000;
/**
 * Reports whether at least one of the recognised provider API key
 * environment variables holds a non-empty value.
 *
 * Both the lower-case and upper-case spellings of each variable are checked.
 *
 * @returns `true` when any of the variables is set to a truthy string.
 */
function hasLiveProviderKey() {
  const providerKeyVariables = [
    'z_ai_glm_api_key',
    'Z_AI_GLM_API_KEY',
    'minimax_api_key',
    'MINIMAX_API_KEY'
  ];

  return providerKeyVariables.some((variableName) => {
    return Boolean(process.env[variableName]);
  });
}
/**
 * Reads an environment variable as a base-10 integer.
 *
 * @param name - Name of the environment variable to read.
 * @param fallback - Value returned when the variable is missing, does not
 *   start with an integer, or parses to zero or a negative number.
 * @returns The parsed positive integer, or `fallback`.
 */
function parseIntegerEnv(name: string, fallback: number) {
  const rawValue = process.env[name] ?? '';
  const candidate = Number.parseInt(rawValue, 10);

  // Only strictly positive, finite integers are accepted.
  if (!Number.isFinite(candidate) || candidate <= 0) {
    return fallback;
  }

  return candidate;
}
/**
 * Computes a percentile of `values` using the nearest-rank method.
 *
 * The input array is copied before sorting, so the caller's array is left
 * untouched.
 *
 * @param values - Samples to evaluate; may be empty.
 * @param quantile - Fraction to look up (for example `0.95` for p95). Values
 *   at or below 0 yield the minimum sample and values at or above 1 yield the
 *   maximum sample.
 * @returns The sample at the requested rank, or `0` when `values` is empty.
 */
function percentile(values: number[], quantile: number) {
  const sortedValues = [...values].sort((a, b) => a - b);

  if (sortedValues.length === 0) {
    return 0;
  }

  const rank = Math.ceil(sortedValues.length * quantile) - 1;

  // Clamp on both sides: a quantile of 0 would otherwise produce rank -1 and
  // read past the start of the array, returning `undefined`.
  const index = Math.min(sortedValues.length - 1, Math.max(0, rank));

  return sortedValues[index];
}
/**
 * Builds an `AiService` whose five constructor collaborators are replaced by
 * jest mocks returning fixed data (quotes and holdings for AAPL, MSFT and
 * NVDA, an always-empty cache, and no-op observability hooks).
 *
 * NOTE(review): presumably this leaves only the LLM provider call as live
 * I/O — confirm against the `AiService` implementation.
 *
 * @returns A new `AiService` instance backed by the mocked collaborators.
 */
function createLiveBenchmarkSubject() {
  // Small factories keep the three per-symbol fixtures uniform.
  const toQuote = (marketPrice: number) => ({
    currency: 'USD',
    marketPrice,
    marketState: 'REGULAR'
  });
  const toHolding = (
    symbol: string,
    allocationInPercentage: number,
    valueInBaseCurrency: number
  ) => ({
    allocationInPercentage,
    dataSource: DataSource.YAHOO,
    symbol,
    valueInBaseCurrency
  });
  const resolvesToUndefined = () => jest.fn().mockResolvedValue(undefined);

  // A fresh quotes object is produced on every call.
  const getQuotes = jest.fn().mockImplementation(async () => ({
    AAPL: toQuote(212.34),
    MSFT: toQuote(451.2),
    NVDA: toQuote(905.7)
  }));

  const getDetails = jest.fn().mockResolvedValue({
    holdings: {
      AAPL: toHolding('AAPL', 0.52, 5200),
      MSFT: toHolding('MSFT', 0.28, 2800),
      NVDA: toHolding('NVDA', 0.2, 2000)
    }
  });

  // Zeroed snapshot returned by the mocked success-capture hook.
  const chatSuccessSnapshot = {
    latencyBreakdownInMs: {
      llmGenerationInMs: 0,
      memoryReadInMs: 0,
      memoryWriteInMs: 0,
      toolExecutionInMs: 0
    },
    latencyInMs: 0,
    tokenEstimate: {
      input: 0,
      output: 0,
      total: 0
    },
    traceId: 'live-benchmark'
  };

  return new AiService(
    { getQuotes } as never,
    { getDetails } as never,
    { getByKey: jest.fn() } as never,
    {
      get: resolvesToUndefined(),
      set: resolvesToUndefined()
    } as never,
    {
      captureChatFailure: resolvesToUndefined(),
      captureChatSuccess: jest.fn().mockResolvedValue(chatSuccessSnapshot),
      recordLlmInvocation: resolvesToUndefined(),
      recordFeedback: resolvesToUndefined()
    } as never
  );
}
/**
 * Sends `query` to `subject.chat` several times in sequence and gathers
 * wall-clock latency statistics for those calls.
 *
 * The iteration count is read from `AI_LIVE_BENCHMARK_ITERATIONS` and the
 * failure budget from `AI_LIVE_BENCHMARK_MAX_FAILURES`; each falls back to its
 * module default when unset or invalid. An iteration counts as failed when
 * `chat` throws or when the trimmed answer is empty. Failed iterations still
 * contribute their duration to the statistics.
 *
 * @param query - Prompt forwarded to `subject.chat` on every iteration.
 * @param sessionPrefix - Prefix of the session id; the iteration index is
 *   appended so every call uses a distinct session id.
 * @param subject - Service instance being measured.
 * @returns Average and p95 latency in milliseconds, plus the failure and
 *   iteration counts.
 * @throws A Jest assertion error when the failure count exceeds the budget.
 */
async function runLiveBenchmark({
  query,
  sessionPrefix,
  subject
}: {
  query: string;
  sessionPrefix: string;
  subject: AiService;
}) {
  const iterations = parseIntegerEnv(
    'AI_LIVE_BENCHMARK_ITERATIONS',
    DEFAULT_BENCHMARK_ITERATIONS
  );

  // Zero is a valid (strictest) failure budget, so it is parsed here instead
  // of through parseIntegerEnv, which only accepts positive values.
  const configuredAllowedFailures = Number.parseInt(
    process.env.AI_LIVE_BENCHMARK_MAX_FAILURES ?? '',
    10
  );
  const allowedFailures =
    Number.isFinite(configuredAllowedFailures) &&
    configuredAllowedFailures >= 0
      ? configuredAllowedFailures
      : DEFAULT_ALLOWED_FAILURES;

  const durationsInMs: number[] = [];
  const failureReasons: string[] = [];
  let failures = 0;

  for (let index = 0; index < iterations; index++) {
    const sessionId = `${sessionPrefix}-${index}`;
    const startedAt = Date.now();

    try {
      const response = await subject.chat({
        languageCode: 'en',
        query,
        sessionId,
        userCurrency: 'USD',
        userId: 'live-benchmark-user'
      });

      if (response.answer.trim().length === 0) {
        failures += 1;
        failureReasons.push(`${sessionId}: empty answer`);
      }
    } catch (error) {
      failures += 1;
      failureReasons.push(
        `${sessionId}: ${error instanceof Error ? error.message : String(error)}`
      );
    } finally {
      durationsInMs.push(Date.now() - startedAt);
    }
  }

  const averageInMs =
    durationsInMs.reduce((sum, duration) => sum + duration, 0) /
    durationsInMs.length;

  // Reasons are reported after the timing loop so logging never skews the
  // measured durations, and before the assertion so they are visible when the
  // budget is exceeded.
  if (failureReasons.length > 0) {
    console.warn(
      `Live benchmark "${sessionPrefix}" recorded ${failures} failed iteration(s): ${failureReasons.join('; ')}`
    );
  }

  expect(failures).toBeLessThanOrEqual(allowedFailures);

  return {
    averageInMs,
    failures,
    iterations,
    p95InMs: percentile(durationsInMs, 0.95)
  };
}
// Live runs are opt-in: the AI_LIVE_BENCHMARK flag must be exactly 'true'
// and at least one provider API key must be present in the environment.
const shouldRunLiveBenchmark =
  process.env.AI_LIVE_BENCHMARK === 'true' ? hasLiveProviderKey() : false;

// When the gate is closed the suite is registered as skipped.
const describeLiveBenchmark = !shouldRunLiveBenchmark
  ? describe.skip
  : describe;
describeLiveBenchmark('AiService Live Latency Benchmark', () => {
  // Each test performs several sequential chat calls, so allow two minutes.
  jest.setTimeout(120_000);

  // One entry per benchmarked scenario; both share the same test body.
  const scenarios = [
    {
      metric: 'single_tool_live_latency',
      query: 'Give me a quick portfolio allocation overview',
      sessionPrefix: 'live-single-tool',
      targetInMs: LIVE_SINGLE_TOOL_TARGET_IN_MS,
      title: 'captures single-tool live latency metrics'
    },
    {
      metric: 'multi_step_live_latency',
      query:
        'Rebalance my portfolio, run a stress test, and give market prices for AAPL and MSFT',
      sessionPrefix: 'live-multi-step',
      targetInMs: LIVE_MULTI_STEP_TARGET_IN_MS,
      title: 'captures multi-step live latency metrics'
    }
  ];

  for (const { metric, query, sessionPrefix, targetInMs, title } of scenarios) {
    it(title, async () => {
      const { averageInMs, failures, iterations, p95InMs } =
        await runLiveBenchmark({
          query,
          sessionPrefix,
          subject: createLiveBenchmarkSubject()
        });

      // Targets are only asserted when explicitly requested; otherwise the
      // test just reports the measurements.
      const enforceTargets =
        process.env.AI_LIVE_BENCHMARK_ENFORCE_TARGETS === 'true';

      // Emit one JSON line per scenario.
      console.info(
        JSON.stringify({
          averageInMs: Number(averageInMs.toFixed(2)),
          failures,
          iterations,
          metric,
          p95InMs,
          targetInMs
        })
      );

      if (enforceTargets) {
        expect(p95InMs).toBeLessThanOrEqual(targetInMs);
      }
    });
  }
});