JCV's Portfolio

packages/chat-preprocess-cli/src/tasks/resume-pdf.ts

import { promises as fs } from 'node:fs'; import path from 'node:path'; import type { JsonSchema, LlmClient } from '@portfolio/chat-llm'; import { PreprocessError, PREPROCESS_ERROR_CODES } from '../errors'; import { requireEnv } from '../env'; import type { PreprocessMetrics } from '../metrics'; import type { PreprocessContext, PreprocessTaskResult } from '../types'; import { normalizeDistinctStrings } from '../utils'; import { getPreprocessLlmClient } from '../llm'; import type { RawAward, RawEducation, RawExperience, RawSkill, ResumeSource } from './resume'; import { detectExperienceType } from './resume';

// pdf-parse expects DOM-like globals in Node; provide minimal stubs to avoid ReferenceErrors.
// Each stub is installed only when the runtime does not already define the global,
// so a real implementation is never overwritten.
const globalAny = globalThis as Record<string, unknown>;
if (typeof globalAny.DOMMatrix === 'undefined') {
  globalAny.DOMMatrix = class DOMMatrix {};
}
if (typeof globalAny.ImageData === 'undefined') {
  globalAny.ImageData = class ImageData {};
}
if (typeof globalAny.Path2D === 'undefined') {
  globalAny.Path2D = class Path2D {};
}

/** Constructor type of the `PDFParse` export of the `pdf-parse` package. */
type PdfParseFn = typeof import('pdf-parse').PDFParse;

/** Cached `PDFParse` constructor; stays `null` until the first successful import. */
let pdfParseFn: PdfParseFn | null = null;

/**
 * Loads `pdf-parse` with a dynamic import on first use and caches its
 * `PDFParse` export for subsequent calls.
 *
 * @returns The `PDFParse` constructor.
 */
async function getPdfParse(): Promise<PdfParseFn> {
  if (!pdfParseFn) {
    const mod = await import('pdf-parse');
    pdfParseFn = mod.PDFParse;
  }
  return pdfParseFn;
}

/** Tag placed at the start of this task's console messages. */
const LOG_PREFIX = '[resume-pdf]';

/** Maximum number of characters of resume text kept by `truncateForPrompt`. */
const MAX_PROMPT_CHARS = 12_000;

/**
 * Loosely-typed experience entry inside a `ResumeExtraction`.
 * Every field is optional; the scalar text fields may also be `null`.
 */
type ExtractedExperience = {
  // Identity
  id?: string | null;
  company?: string | null;
  title?: string | null;
  location?: string | null;

  // Date range, kept as strings at this stage
  startDate?: string | null;
  endDate?: string | null;

  // Narrative content
  summary?: string | null;
  bullets?: string[];

  // Cross references
  skills?: string[];
  linkedProjects?: string[];
};

/**
 * Structured result of parsing a resume: a list of experiences plus optional
 * education, award and skill sections. Section entries are partial because
 * they have not yet been through the `sanitize*` helpers.
 */
type ResumeExtraction = {
  experiences: ExtractedExperience[];
  education?: Array<Partial<RawEducation>>;
  awards?: Array<Partial<RawAward>>;
  skills?: Array<Partial<RawSkill>>;
};

/**
 * Normalizes whitespace in text extracted from a PDF.
 *
 * Steps, in order:
 *  1. non-breaking spaces become regular spaces;
 *  2. runs of spaces/tabs collapse to a single space;
 *  3. spaces immediately before a line break are removed;
 *  4. three or more consecutive line breaks collapse to one blank line;
 *  5. leading and trailing whitespace is trimmed.
 *
 * NOTE(review): the newline-based patterns in steps 3 and 4 were reconstructed
 * from a whitespace-mangled copy of this function — confirm against version control.
 *
 * @param raw Text as returned by the PDF parser.
 * @returns The cleaned text; empty string when `raw` contains only whitespace.
 */
function cleanResumeText(raw: string): string {
  return raw
    .replace(/\u00a0/g, ' ')
    .replace(/[ \t]+/g, ' ')
    .replace(/ +\n/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/**
 * Caps `text` at `maxChars` characters (UTF-16 code units). When the text is
 * longer, the kept prefix is followed by a blank line and a
 * `[truncated N chars]` marker stating how many characters were dropped.
 *
 * @param text Text destined for the LLM prompt.
 * @param maxChars Maximum number of characters to keep; defaults to `MAX_PROMPT_CHARS`.
 * @returns `text` unchanged when it fits, otherwise the prefix plus the marker.
 */
function truncateForPrompt(text: string, maxChars = MAX_PROMPT_CHARS): string {
  if (text.length <= maxChars) {
    return text;
  }
  const dropped = text.length - maxChars;
  return `${text.slice(0, maxChars)}\n\n[truncated ${dropped} chars]`;
}

/**
 * Concatenates two optional string lists (`primary` entries first) and passes
 * the combined list through `normalizeDistinctStrings`.
 *
 * @param primary Entries placed first; treated as empty when omitted.
 * @param secondary Entries appended after `primary`; treated as empty when omitted.
 * @returns Whatever `normalizeDistinctStrings` yields for the combined list.
 */
function dedupeMerge(primary?: string[], secondary?: string[]): string[] {
  const head = primary ?? [];
  const tail = secondary ?? [];
  const combined = head.concat(tail);
  return normalizeDistinctStrings(combined);
}

let normalizedEnd: string | null = null; try { normalizedEnd = coerceEndDate(raw.endDate); } catch (error) { console.warn(${LOG_PREFIX} Ignoring invalid end date for ${company} (${title}):, error); }

const { id: _rawId, location, ...rest } = raw; const normalizedLocation = location?.trim() || undefined; const sanitizedBullets = normalizeDistinctStrings(raw.bullets); const sanitizedSkills = normalizeDistinctStrings(raw.skills); const sanitizedLinked = normalizeDistinctStrings(raw.linkedProjects); const sanitizedSummary = raw.summary?.trim() || undefined; const experienceType = detectExperienceType({ ...rest, company, title, startDate: normalizedStart, endDate: normalizedEnd ?? undefined, location: normalizedLocation, bullets: sanitizedBullets, skills: sanitizedSkills, linkedProjects: sanitizedLinked, summary: sanitizedSummary, });

return { id: raw.id?.trim() || undefined, company, title, location: normalizedLocation, startDate: normalizedStart, endDate: normalizedEnd, summary: sanitizedSummary, bullets: sanitizedBullets, skills: sanitizedSkills, linkedProjects: sanitizedLinked, experienceType, }; }

/**
 * Trims and validates one extracted education entry.
 *
 * Text fields are trimmed; values that are missing or blank after trimming
 * become `undefined` (`null` for `endDate`), matching how experience entries
 * are sanitized. `bullets` and `skills` go through `normalizeDistinctStrings`.
 *
 * @param raw Partially populated education entry.
 * @returns The cleaned entry, or `null` when `institution` is missing or blank.
 */
function sanitizeEducation(raw: Partial<RawEducation>): RawEducation | null {
  const institution = raw.institution?.trim();
  if (!institution) return null;

  return {
    id: raw.id?.trim() || undefined,
    institution,
    degree: raw.degree?.trim() || undefined,
    field: raw.field?.trim() || undefined,
    location: raw.location?.trim() || undefined,
    startDate: raw.startDate?.trim() || undefined,
    endDate: raw.endDate?.trim() || null,
    summary: raw.summary?.trim() || undefined,
    bullets: normalizeDistinctStrings(raw.bullets),
    skills: normalizeDistinctStrings(raw.skills),
  };
}

/**
 * Trims and validates one extracted award entry.
 *
 * Text fields are trimmed; values that are missing or blank after trimming
 * become `undefined`, matching how experience entries are sanitized.
 * `bullets` and `skills` go through `normalizeDistinctStrings`.
 *
 * @param raw Partially populated award entry.
 * @returns The cleaned entry, or `null` when `title` is missing or blank.
 */
function sanitizeAward(raw: Partial<RawAward>): RawAward | null {
  const title = raw.title?.trim();
  if (!title) return null;

  return {
    id: raw.id?.trim() || undefined,
    title,
    issuer: raw.issuer?.trim() || undefined,
    date: raw.date?.trim() || undefined,
    summary: raw.summary?.trim() || undefined,
    bullets: normalizeDistinctStrings(raw.bullets),
    skills: normalizeDistinctStrings(raw.skills),
  };
}

/**
 * Trims and validates one extracted skill entry.
 *
 * Text fields are trimmed; values that are missing or blank after trimming
 * become `undefined`, matching how experience entries are sanitized.
 * The nested `skills` list goes through `normalizeDistinctStrings`.
 *
 * @param raw Partially populated skill entry.
 * @returns The cleaned entry, or `null` when `name` is missing or blank.
 */
function sanitizeSkill(raw: Partial<RawSkill>): RawSkill | null {
  const name = raw.name?.trim();
  if (!name) return null;

  return {
    id: raw.id?.trim() || undefined,
    name,
    category: raw.category?.trim() || undefined,
    summary: raw.summary?.trim() || undefined,
    skills: normalizeDistinctStrings(raw.skills),
  };
}

/**
 * Builds the lookup key used to pair an incoming experience with a stored one:
 * the lower-cased `id`, `company` and `title` joined by `::`. Missing or
 * nullish parts contribute an empty string, so an empty entry yields `::::`.
 *
 * @param exp Experience (possibly partial) to derive the key from.
 * @returns The case-insensitive composite key.
 */
function makeMatchKey(exp: Partial<RawExperience>): string {
  const parts = [exp.id, exp.company, exp.title];
  return parts.map((part) => part?.toLowerCase() ?? '').join('::');
}

/**
 * Reads a previously written resume JSON file.
 *
 * @param filePath Location of the JSON file.
 * @returns The parsed content (cast to `ResumeSource`, not validated), or
 *          `null` when the file does not exist.
 * @throws Any read error other than `ENOENT`, and any JSON parse error.
 */
async function readExistingResume(filePath: string): Promise<ResumeSource | null> {
  let contents: string;
  try {
    contents = await fs.readFile(filePath, 'utf-8');
  } catch (error) {
    // A missing file just means there is nothing to merge with yet.
    const isMissing = (error as NodeJS.ErrnoException).code === 'ENOENT';
    if (isMissing) {
      return null;
    }
    throw error;
  }
  return JSON.parse(contents) as ResumeSource;
}

/**
 * Overlays freshly extracted experiences onto the ones already stored.
 *
 * Entries are paired through `makeMatchKey`. For a paired entry the incoming
 * values win, with these fallbacks to the stored entry:
 *  - `id`, `summary`, `location`, `endDate`: used when the incoming value is nullish;
 *  - `bullets`: used when the incoming list is empty;
 *  - `skills`: merged with the incoming list, or used alone when that list is empty;
 *  - `linkedProjects`: always merged with the incoming list.
 * Unpaired incoming entries are returned unchanged; stored entries with no
 * incoming counterpart are not carried over.
 *
 * @param experiences Sanitized experiences from the current extraction.
 * @param existing Previously stored resume, or `null` when there is none.
 * @returns The experiences to persist, in the order of `experiences`.
 */
function mergeWithExisting(experiences: RawExperience[], existing: ResumeSource | null): RawExperience[] {
  // The stored file is parsed without validation, so tolerate a payload whose
  // `experiences` list is absent or not an array instead of crashing.
  if (!existing || !Array.isArray(existing.experiences)) {
    return experiences;
  }

  const existingByKey = new Map<string, RawExperience>();
  for (const exp of existing.experiences) {
    existingByKey.set(makeMatchKey(exp), exp);
  }

  return experiences.map((exp) => {
    const match = existingByKey.get(makeMatchKey(exp));
    if (!match) {
      return exp;
    }
    const bullets = exp.bullets ?? [];
    const skills = exp.skills ?? [];
    return {
      ...exp,
      id: exp.id ?? match.id,
      summary: exp.summary ?? match.summary,
      bullets: bullets.length ? bullets : match.bullets,
      skills: skills.length ? dedupeMerge(skills, match.skills) : match.skills,
      linkedProjects: dedupeMerge(exp.linkedProjects, match.linkedProjects),
      location: exp.location ?? match.location,
      endDate: exp.endDate ?? match.endDate ?? null,
    };
  });
}

/**
 * Merges an incoming resume section (education, awards, skills, ...) with the
 * stored one.
 *
 * - No stored items: the incoming items (minus falsy entries) are returned.
 * - No incoming items: the stored items are returned as-is.
 * - Otherwise each incoming item is paired with the stored item that has the
 *   same `id` (case-insensitive). Incoming fields override stored ones, except
 *   that `undefined`/`null` incoming fields leave the stored value in place,
 *   so an extraction that omits a field cannot erase it. Items with no `id`
 *   or no stored counterpart are returned unchanged. Stored items that are
 *   not matched are not carried over.
 *
 * @param incoming Items from the current extraction.
 * @param existing Items already stored.
 * @returns The merged section, in the order of `incoming`.
 */
function mergeSections<T extends { id?: string | null }>(incoming: T[] | undefined, existing: T[] | undefined): T[] {
  const safeIncoming = incoming?.filter(Boolean) ?? [];
  if (!existing?.length) return safeIncoming;
  if (!safeIncoming.length) return existing;

  const keyOf = (item: T): string => (item.id ?? '').toString().toLowerCase();

  const byId = new Map<string, T>();
  for (const item of existing) {
    const key = keyOf(item);
    if (key) byId.set(key, item);
  }

  return safeIncoming.map((item) => {
    // The empty key is never stored, so id-less items fall through untouched.
    const prior = byId.get(keyOf(item));
    if (!prior) return item;

    const definedFields = Object.fromEntries(Object.entries(item).filter(([, value]) => value != null));
    return { ...prior, ...definedFields };
  });
}

async function extractExperiencesFromPdf( llm: LlmClient, pdfPath: string, model: string, metrics?: PreprocessMetrics ): Promise<{ experiences: RawExperience[]; education: RawEducation[]; awards: RawAward[]; skills: RawSkill[]; }> { const pdfBuffer = await fs.readFile(pdfPath); const PdfParseCtor = await getPdfParse(); const parser = new PdfParseCtor({ data: pdfBuffer }); let cleaned = ''; try { const result = await parser.getText(); cleaned = cleanResumeText(result.text ?? ''); } finally { await parser.destroy().catch(() => undefined); } if (!cleaned) { throw new PreprocessError(PREPROCESS_ERROR_CODES.PDF_UNREADABLE, 'Unable to extract text from resume PDF.'); }

const truncated = truncateForPrompt(cleaned);

const schema: JsonSchema = { type: 'json_schema', name: 'ResumeExtraction', strict: true, schema: { type: 'object', additionalProperties: false, required: ['experiences', 'education', 'awards', 'skills'], properties: { experiences: { type: 'array', minItems: 1, items: { type: 'object', additionalProperties: false, required: [ 'id', 'company', 'title', 'location', 'startDate', 'endDate', 'summary', 'bullets', 'skills', 'linkedProjects', ], properties: { id: { type: ['string', 'null'], description: 'Optional stable identifier. Use a slug-friendly string if available.', }, company: { type: 'string' }, title: { type: 'string' }, location: { type: ['string', 'null'] }, startDate: { type: 'string', description: 'ISO 8601 date (YYYY-MM-01). Use the first day of the month when day is missing.', }, endDate: { anyOf: [{ type: 'string' }, { type: 'null' }], description: 'ISO 8601 date or null if the role is ongoing.', }, summary: { type: ['string', 'null'], description: 'One-sentence summary highlighting the impact of this role.', }, bullets: { type: 'array', items: { type: 'string' }, description: 'Up to 4 concrete accomplishment bullet points.', maxItems: 4, }, skills: { type: 'array', items: { type: 'string' }, description: 'Technologies, platforms, or methodologies explicitly tied to this role.', }, linkedProjects: { type: 'array', items: { type: 'string' }, description: 'Portfolio project slugs that should be associated with this experience.', }, }, }, }, education: { type: 'array', items: { type: 'object', additionalProperties: false, properties: { id: { type: ['string', 'null'] }, institution: { type: 'string' }, degree: { type: ['string', 'null'] }, field: { type: ['string', 'null'] }, location: { type: ['string', 'null'] }, startDate: { type: ['string', 'null'] }, endDate: { type: ['string', 'null'] }, summary: { type: ['string', 'null'] }, bullets: { type: 'array', items: { type: 'string' }, maxItems: 4 }, skills: { type: 'array', items: { type: 'string' } }, }, 
required: [ 'id', 'institution', 'degree', 'field', 'location', 'startDate', 'endDate', 'summary', 'bullets', 'skills', ], }, }, awards: { type: 'array', items: { type: 'object', additionalProperties: false, properties: { id: { type: ['string', 'null'] }, title: { type: 'string' }, issuer: { type: ['string', 'null'] }, date: { type: ['string', 'null'] }, summary: { type: ['string', 'null'] }, bullets: { type: 'array', items: { type: 'string' }, maxItems: 4 }, skills: { type: 'array', items: { type: 'string' } }, }, required: ['id', 'title', 'issuer', 'date', 'summary', 'bullets', 'skills'], }, }, skills: { type: 'array', items: { type: 'object', additionalProperties: false, properties: { id: { type: ['string', 'null'] }, name: { type: 'string' }, category: { type: ['string', 'null'] }, summary: { type: ['string', 'null'] }, skills: { type: 'array', items: { type: 'string' } }, }, required: ['id', 'name', 'category', 'summary', 'skills'], }, }, }, }, };

const systemPrompt = 'You are a meticulous resume parser. Convert resume text into structured resume objects (experiences, education, awards, skills). Preserve factual bullet points and keep wording concise. Dates must be ISO formatted (YYYY-MM-DD).'; const userPrompt = Resume text: ${truncated};

const response = await (metrics ? metrics.wrapLlm({ stage: 'other', model, meta: { pdf: path.basename(pdfPath) } }, () => llm.createStructuredJson({ model, systemPrompt, userContent: userPrompt, jsonSchema: schema, stage: 'resume_pdf', }) ) : llm.createStructuredJson({ model, systemPrompt, userContent: userPrompt, jsonSchema: schema, stage: 'resume_pdf', }));

/**
 * Preprocess task: turns the resume PDF into the raw resume JSON artifact.
 *
 * Flow:
 *  1. verify the PDF is accessible;
 *  2. verify the API key for the configured provider is set;
 *  3. concurrently extract structured data from the PDF via the LLM and read
 *     any previously written resume JSON;
 *  4. merge the extraction with the stored data;
 *  5. write the merged payload through `context.artifacts.writeJson`.
 *
 * @param context Preprocess context supplying paths, config, models, metrics and the artifact writer.
 * @returns A summary with per-section counts and the written artifact's relative path.
 * @throws PreprocessError `PDF_NOT_FOUND` when the PDF cannot be accessed, and
 *         `PDF_EMPTY` when the extraction yields no experiences.
 */
export async function runResumePdfTask(context: PreprocessContext): Promise<PreprocessTaskResult> {
  const pdfPath = context.paths.resumePdf;
  const outputPath = context.paths.resumeJson;
  const rootDir = context.paths.rootDir;

  try {
    await fs.access(pdfPath);
  } catch {
    throw new PreprocessError(
      PREPROCESS_ERROR_CODES.PDF_NOT_FOUND,
      `Resume PDF not found at ${path.relative(rootDir, pdfPath)}`
    );
  }

  // Ensure provider API key is present (OPENAI_API_KEY or ANTHROPIC_API_KEY).
  if (context.config.provider === 'openai') {
    requireEnv('OPENAI_API_KEY', 'OPENAI_API_KEY is required for resume ingestion');
  } else {
    requireEnv('ANTHROPIC_API_KEY', 'ANTHROPIC_API_KEY is required for resume ingestion when provider=anthropic');
  }
  const llm = getPreprocessLlmClient(context.config.provider);
  const { resumeTextModel } = context.models;

  // The LLM extraction and the read of the stored file are independent, so run them together.
  const [extracted, existing] = await Promise.all([
    extractExperiencesFromPdf(llm, pdfPath, resumeTextModel, context.metrics),
    readExistingResume(outputPath),
  ]);
  if (!extracted.experiences.length) {
    throw new PreprocessError(PREPROCESS_ERROR_CODES.PDF_EMPTY, 'Resume PDF did not yield any experiences.');
  }

  const mergedExperiences = mergeWithExisting(extracted.experiences, existing);
  const mergedEducation = mergeSections(extracted.education, existing?.education);
  const mergedAwards = mergeSections(extracted.awards, existing?.awards);
  const mergedSkills = mergeSections(extracted.skills, existing?.skills);

  // Date portion (YYYY-MM-DD) of the current UTC timestamp.
  const snapshotDate = new Date().toISOString().split('T')[0] ?? 'unspecified';
  const payload: ResumeSource = {
    snapshotDate,
    experiences: mergedExperiences,
    education: mergedEducation,
    awards: mergedAwards,
    skills: mergedSkills,
  };

  const artifact = await context.artifacts.writeJson({
    id: 'resume-raw',
    filePath: outputPath,
    data: payload,
  });

  return {
    description: `Extracted ${mergedExperiences.length} experiences from resume PDF`,
    counts: [
      { label: 'Experiences', value: mergedExperiences.length },
      { label: 'Education', value: mergedEducation.length },
      { label: 'Awards', value: mergedAwards.length },
      { label: 'Skills', value: mergedSkills.length },
    ],
    artifacts: [{ path: artifact.relativePath, note: snapshotDate }],
  };
}