import path from 'node:path'; import { promises as fs } from 'node:fs'; import OpenAI from 'openai'; import { PreprocessError, PREPROCESS_ERROR_CODES } from '../errors'; import { requireEnv } from '../env'; import type { PreprocessContext, PreprocessTaskResult } from '../types'; import type { NormalizedAward, NormalizedEducation, NormalizedExperience, NormalizedSkill } from './resume';
type ResumeDataset = { experiences: NormalizedExperience[]; education?: NormalizedEducation[]; awards?: NormalizedAward[]; skills?: NormalizedSkill[]; };
function formatTimeframe(start?: string, end?: string | null): string {
if (!start) return end ?? 'present';
const safeEnd = end ?? 'present';
return ${start} → ${safeEnd};
}
function buildExperienceEmbeddingInput(experience: NormalizedExperience): string {
const parts = [
${experience.company} — ${experience.title},
experience.location ? Location: ${experience.location} : '',
Timeframe: ${formatTimeframe(experience.startDate, experience.endDate)},
experience.summary ?? '',
experience.bullets.length ? Highlights: ${experience.bullets.join(' • ')} : '',
experience.skills.length ? Skills: ${experience.skills.join(', ')} : '',
];
return parts
.map((part) => part.trim())
.filter((part) => part.length > 0)
.join('
');
}
function buildEducationEmbeddingInput(edu: NormalizedEducation): string {
const parts = [
${edu.institution} — ${[edu.degree, edu.field].filter(Boolean).join(' ')}.trim(),
edu.location ? Location: ${edu.location} : '',
edu.startDate || edu.endDate ? Timeframe: ${formatTimeframe(edu.startDate, edu.endDate)} : '',
edu.summary ?? '',
(edu.bullets ?? []).length ? Highlights: ${(edu.bullets ?? []).join(' • ')} : '',
(edu.skills ?? []).length ? Skills: ${(edu.skills ?? []).join(', ')} : '',
];
return parts
.map((part) => part.trim())
.filter((part) => part.length > 0)
.join('
');
}
function buildAwardEmbeddingInput(award: NormalizedAward): string {
const parts = [
${award.title}${award.issuer ? — ${award.issuer} : ''},
award.date ? Date: ${award.date} : '',
award.summary ?? '',
(award.bullets ?? []).length ? Highlights: ${(award.bullets ?? []).join(' • ')} : '',
(award.skills ?? []).length ? Skills: ${(award.skills ?? []).join(', ')} : '',
];
return parts
.map((part) => part.trim())
.filter((part) => part.length > 0)
.join('
');
}
function buildSkillEmbeddingInput(skill: NormalizedSkill): string {
const parts = [
skill.name,
skill.category ? Category: ${skill.category} : '',
skill.summary ?? '',
(skill.skills ?? []).length ? Related: ${(skill.skills ?? []).join(', ')} : '',
];
return parts
.map((part) => (part ?? '').trim())
.filter((part) => part.length > 0)
.join('
');
}
async function loadResume(filePath: string): Promise { const raw = await fs.readFile(filePath, 'utf-8'); const parsed = JSON.parse(raw) as ResumeDataset; if (!Array.isArray(parsed.experiences)) { throw new PreprocessError( PREPROCESS_ERROR_CODES.RESUME_SOURCE_INVALID, 'Resume dataset must include an experiences array' ); } return { experiences: parsed.experiences, education: parsed.education ?? [], awards: parsed.awards ?? [], skills: parsed.skills ?? [], }; }
function relPath(context: PreprocessContext, filePath: string): string { return path.relative(context.paths.rootDir, filePath); }
export async function runExperienceEmbeddingsTask(context: PreprocessContext): Promise { const openAiKey = requireEnv('OPENAI_API_KEY'); const datasetPath = context.paths.experiencesOutput; const outputPath = context.paths.resumeEmbeddingsOutput; const client = new OpenAI({ apiKey: openAiKey }); const { resumeEmbeddingModel } = context.models;
const exists = await fs
.access(datasetPath)
.then(() => true)
.catch(() => false);
if (!exists) {
throw new PreprocessError(
PREPROCESS_ERROR_CODES.NO_RESUME,
Experience dataset not found at ${relPath(context, datasetPath)}
);
}
const dataset = await loadResume(datasetPath); const allEntries = [ ...dataset.experiences, ...(dataset.education ?? []), ...(dataset.awards ?? []), ...(dataset.skills ?? []), ];
if (!allEntries.length) { const buildId = new Date().toISOString(); const emptyIndex = { meta: { schemaVersion: 1, buildId, }, entries: [] as Array<{ id: string; vector: number[] }>, }; const artifact = await context.artifacts.writeJson({ id: 'resume-embeddings', filePath: outputPath, data: emptyIndex, }); return { description: 'No experiences found. Wrote empty embeddings file.', counts: [{ label: 'Embeddings', value: 0 }], artifacts: [{ path: artifact.relativePath, note: '0 vectors' }], }; }
const embeddings: Array<{ id: string; vector: number[] }> = []; const payloads = allEntries.map((entry) => { const entryType = (entry as { type?: string }).type; if (entryType === 'skill') { return { id: entry.id, payload: buildSkillEmbeddingInput(entry as NormalizedSkill) }; } if (entryType === 'education') { return { id: entry.id, payload: buildEducationEmbeddingInput(entry as NormalizedEducation) }; } if (entryType === 'award') { return { id: entry.id, payload: buildAwardEmbeddingInput(entry as NormalizedAward) }; } if ('company' in entry) { return { id: entry.id, payload: buildExperienceEmbeddingInput(entry as NormalizedExperience) }; } if ('institution' in entry) { return { id: entry.id, payload: buildEducationEmbeddingInput(entry as NormalizedEducation) }; } if ('issuer' in entry) { return { id: entry.id, payload: buildAwardEmbeddingInput(entry as NormalizedAward) }; } return { id: entry.id, payload: buildSkillEmbeddingInput(entry as NormalizedSkill) }; });
const BATCH_SIZE = 32;
for (let idx = 0; idx < payloads.length; idx += BATCH_SIZE) {
const batch = payloads.slice(idx, idx + BATCH_SIZE);
const response = await context.metrics.wrapLlm(
{ stage: 'other', model: resumeEmbeddingModel, meta: { batchSize: batch.length } },
() =>
client.embeddings.create({
model: resumeEmbeddingModel,
input: batch.map((item) => item.payload),
})
);
response.data.forEach((row, rowIdx) => {
const record = batch[rowIdx];
embeddings.push({
id: record?.id ?? entry-${idx + rowIdx},
vector: row?.embedding ?? [],
});
});
}
const buildId = new Date().toISOString(); const embeddingIndex = { meta: { schemaVersion: 1, buildId, }, entries: embeddings, };
const artifact = await context.artifacts.writeJson({ id: 'resume-embeddings', filePath: outputPath, data: embeddingIndex, });
return {
description: Generated ${embeddings.length} resume embeddings,
counts: [
{ label: 'Experiences', value: dataset.experiences.length },
{ label: 'Education', value: dataset.education?.length ?? 0 },
{ label: 'Awards', value: dataset.awards?.length ?? 0 },
{ label: 'Skills', value: dataset.skills?.length ?? 0 },
{ label: 'Embeddings', value: embeddings.length },
],
artifacts: [{ path: artifact.relativePath, note: ${embeddings.length} vectors }],
};
}
