Browse Source

Merge pull request #3 from RobertgPatch/005-k1-parser-fix

005 k1 parser fix
pull/6701/head
RobertgPatch 2 months ago
committed by GitHub
parent
commit
fdceaef081
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 11
      .github/agents/copilot-instructions.md
  2. 55
      .specify/memory/constitution.md
  3. 4
      apps/api/src/app/app.module.ts
  4. 145
      apps/api/src/app/cell-mapping/cell-mapping.controller.ts
  5. 14
      apps/api/src/app/cell-mapping/cell-mapping.module.ts
  6. 467
      apps/api/src/app/cell-mapping/cell-mapping.service.ts
  7. 15
      apps/api/src/app/k1-import/dto/confirm-k1.dto.ts
  8. 10
      apps/api/src/app/k1-import/dto/upload-k1.dto.ts
  9. 28
      apps/api/src/app/k1-import/dto/verify-k1.dto.ts
  10. 302
      apps/api/src/app/k1-import/extractors/azure-extractor.ts
  11. 22
      apps/api/src/app/k1-import/extractors/k1-extractor.interface.ts
  12. 1325
      apps/api/src/app/k1-import/extractors/k1-position-regions.ts
  13. 967
      apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
  14. 208
      apps/api/src/app/k1-import/extractors/tesseract-extractor.ts
  15. 96
      apps/api/src/app/k1-import/k1-aggregation.service.ts
  16. 92
      apps/api/src/app/k1-import/k1-allocation.service.ts
  17. 111
      apps/api/src/app/k1-import/k1-confidence.service.ts
  18. 160
      apps/api/src/app/k1-import/k1-field-mapper.service.ts
  19. 138
      apps/api/src/app/k1-import/k1-import.controller.ts
  20. 68
      apps/api/src/app/k1-import/k1-import.module.ts
  21. 903
      apps/api/src/app/k1-import/k1-import.service.ts
  22. 17
      apps/api/src/app/upload/upload.service.ts
  23. 2
      apps/api/src/environments/environment.ts
  24. 2
      apps/api/src/services/configuration/configuration.service.ts
  25. 2
      apps/api/src/services/interfaces/environment.interface.ts
  26. 5
      apps/client/project.json
  27. 14
      apps/client/src/app/app.routes.ts
  28. 42
      apps/client/src/app/components/header/header.component.html
  29. 372
      apps/client/src/app/pages/cell-mapping/cell-mapping-page.component.ts
  30. 207
      apps/client/src/app/pages/cell-mapping/cell-mapping-page.html
  31. 15
      apps/client/src/app/pages/cell-mapping/cell-mapping-page.routes.ts
  32. 258
      apps/client/src/app/pages/cell-mapping/cell-mapping-page.scss
  33. 134
      apps/client/src/app/pages/k-documents/k-document-detail/k-document-detail.component.ts
  34. 106
      apps/client/src/app/pages/k-documents/k-document-detail/k-document-detail.html
  35. 79
      apps/client/src/app/pages/k-documents/k-document-detail/k-document-detail.scss
  36. 2
      apps/client/src/app/pages/k-documents/k-documents-page.component.ts
  37. 9
      apps/client/src/app/pages/k-documents/k-documents-page.routes.ts
  38. 205
      apps/client/src/app/pages/k1-import/k1-confirmation/k1-confirmation.component.ts
  39. 167
      apps/client/src/app/pages/k1-import/k1-confirmation/k1-confirmation.html
  40. 37
      apps/client/src/app/pages/k1-import/k1-confirmation/k1-confirmation.scss
  41. 269
      apps/client/src/app/pages/k1-import/k1-import-page.component.ts
  42. 162
      apps/client/src/app/pages/k1-import/k1-import-page.html
  43. 33
      apps/client/src/app/pages/k1-import/k1-import-page.routes.ts
  44. 47
      apps/client/src/app/pages/k1-import/k1-import-page.scss
  45. 434
      apps/client/src/app/pages/k1-import/k1-verification/k1-verification.component.ts
  46. 264
      apps/client/src/app/pages/k1-import/k1-verification/k1-verification.html
  47. 199
      apps/client/src/app/pages/k1-import/k1-verification/k1-verification.scss
  48. 4
      apps/client/src/app/services/family-office-data.service.ts
  49. 212
      apps/client/src/app/services/k1-import-data.service.ts
  50. 274
      k1-positions-dump.txt
  51. 14
      libs/common/src/lib/dtos/index.ts
  52. 132
      libs/common/src/lib/dtos/k1-import.dto.ts
  53. 14
      libs/common/src/lib/interfaces/index.ts
  54. 134
      libs/common/src/lib/interfaces/k1-import.interface.ts
  55. 587
      libs/ui/src/lib/k-document-form/k-document-form.component.ts
  56. 460
      package-lock.json
  57. 4
      package.json
  58. 93
      prisma/migrations/20260321000000_added_k1_import_tables/migration.sql
  59. 104
      prisma/schema.prisma
  60. 535
      specs/001-family-office-transform/research-normalized-k1-model.md
  61. 37
      specs/004-k1-scan-import/checklists/requirements.md
  62. 525
      specs/004-k1-scan-import/contracts/k1-import-api.md
  63. 300
      specs/004-k1-scan-import/data-model.md
  64. 124
      specs/004-k1-scan-import/plan.md
  65. 126
      specs/004-k1-scan-import/quickstart.md
  66. 205
      specs/004-k1-scan-import/research.md
  67. 220
      specs/004-k1-scan-import/spec.md
  68. 282
      specs/004-k1-scan-import/tasks.md
  69. 36
      specs/005-k1-parser-fix/checklists/requirements.md
  70. 107
      specs/005-k1-parser-fix/contracts/extraction.md
  71. 94
      specs/005-k1-parser-fix/data-model.md
  72. 81
      specs/005-k1-parser-fix/plan.md
  73. 64
      specs/005-k1-parser-fix/quickstart.md
  74. 221
      specs/005-k1-parser-fix/research.md
  75. 202
      specs/005-k1-parser-fix/spec.md
  76. 237
      specs/005-k1-parser-fix/tasks.md
  77. 43
      test/import/ok/sample-k1-digital.txt
  78. 50
      test/import/ok/sample-k1-scanned.txt
  79. 74
      tools/extract-k1-positions.mjs
  80. 427
      tools/test-k1-parse.mjs

11
.github/agents/copilot-instructions.md

@ -1,10 +1,14 @@
# portfolio-management Development Guidelines
Auto-generated from all feature plans. Last updated: 2026-03-16
Auto-generated from all feature plans. Last updated: 2026-03-18
## Active Technologies
- TypeScript 5.9.2, Node.js >= 22.18.0 + Angular 21.1.1, NestJS 11.1.14, Angular Material 21.1.1, Prisma 6.19.0, big.js, date-fns 4.1.0 (003-portfolio-performance-views)
- PostgreSQL via Prisma ORM (003-portfolio-performance-views)
- TypeScript 5.9.2, Node.js ≥ 22.18.0 + NestJS 11.x (backend), Angular 21.x (frontend), Prisma 6.x (ORM), pdf-parse (PDF text), @azure/ai-form-recognizer (cloud OCR), tesseract.js (local OCR fallback) (004-k1-scan-import)
- PostgreSQL via Prisma (structured data), local filesystem `uploads/` (PDF files) (004-k1-scan-import)
- TypeScript 5.x (Node.js runtime) + NestJS 11.x, pdfjs-dist 5.4.x (already installed via pdf-parse), pdf-parse 2.4.x (kept for `isDigitalK1` detection) (005-k1-parser-fix)
- PostgreSQL via Prisma ORM (existing K1ImportSession, Document tables) (005-k1-parser-fix)
- TypeScript 5.9.2, Node.js ≥22.18.0 + NestJS 11.1.14 (API), Angular 21.1.1 + Angular Material 21.1.1 (client), Prisma 6.19.0 (ORM), Nx 22.5.3 (monorepo), big.js (decimal math), date-fns 4.1.0, chart.js 4.5.1, Bull 4.16.5 (job queues), Redis (caching), yahoo-finance2 3.13.2 (001-family-office-transform)
@ -25,9 +29,10 @@ npm test; npm run lint
TypeScript 5.9.2, Node.js ≥22.18.0: Follow standard conventions
## Recent Changes
- 003-portfolio-performance-views: Added TypeScript 5.9.2, Node.js >= 22.18.0 + Angular 21.1.1, NestJS 11.1.14, Angular Material 21.1.1, Prisma 6.19.0, big.js, date-fns 4.1.0
- 005-k1-parser-fix: Added TypeScript 5.x (Node.js runtime) + NestJS 11.x, pdfjs-dist 5.4.x (already installed via pdf-parse), pdf-parse 2.4.x (kept for `isDigitalK1` detection)
- 004-k1-scan-import: Added TypeScript 5.9.2, Node.js ≥ 22.18.0 + NestJS 11.x (backend), Angular 21.x (frontend), Prisma 6.x (ORM), pdf-parse (PDF text), @azure/ai-form-recognizer (cloud OCR), tesseract.js (local OCR fallback)
- 004-k1-scan-import: Added TypeScript 5.9.2, Node.js ≥ 22.18.0 + NestJS 11.x (backend), Angular 21.x (frontend), Prisma 6.x (ORM), pdf-parse (PDF text), @azure/ai-form-recognizer (cloud OCR), tesseract.js (local OCR fallback)
- 001-family-office-transform: Added TypeScript 5.9.2, Node.js ≥22.18.0 + NestJS 11.1.14 (API), Angular 21.1.1 + Angular Material 21.1.1 (client), Prisma 6.19.0 (ORM), Nx 22.5.3 (monorepo), big.js (decimal math), date-fns 4.1.0, chart.js 4.5.1, Bull 4.16.5 (job queues), Redis (caching), yahoo-finance2 3.13.2
<!-- MANUAL ADDITIONS START -->
<!-- MANUAL ADDITIONS END -->

55
.specify/memory/constitution.md

@ -0,0 +1,55 @@
# Ghostfolio Constitution
## Core Principles
### I. Nx Monorepo Structure
Ghostfolio uses an Nx monorepo with apps (`api`, `client`) and libs (`common`, `ui`). Features must respect project boundaries:
- `@ghostfolio/common` — shared interfaces, types, constants (no framework dependencies)
- `@ghostfolio/ui` — shared Angular UI components
- `@ghostfolio/api` — NestJS backend services, controllers, modules
- `@ghostfolio/client` — Angular frontend pages, services, components
### II. NestJS Module Pattern
Backend features are organized as NestJS modules with:
- Module file registering providers, controllers, imports, exports
- Controller for HTTP endpoints (no business logic)
- Service for business logic
- Interfaces in `@ghostfolio/common` for shared types
### III. Prisma Data Layer
Database access uses Prisma ORM exclusively. Schema changes require migrations. No direct SQL queries. The `PrismaService` is injected via `PrismaModule`.
### IV. TypeScript Strict Conventions
- `noUnusedLocals: true`, `noUnusedParameters: true` — no dead code allowed
- `esModuleInterop: true` — use default imports for CommonJS modules
- Path aliases: `@ghostfolio/api/*`, `@ghostfolio/common/*`, `@ghostfolio/client/*`, `@ghostfolio/ui/*`
### V. Simplicity First
- Start with the simplest solution that works
- YAGNI — don't add abstractions until needed
- Prefer modifying existing files over creating new architectural layers
- Maximum 3 Nx projects per feature (api + common is typical, client when UI needed)
### VI. Interface-First Design
- Shared interfaces live in `@ghostfolio/common`
- API endpoints return typed DTOs
- Feature contracts defined before implementation
## Additional Constraints
- **Angular 21+**: Standalone components, signals preferred
- **NestJS 11+**: Module-based DI, versioned API (URI-based v1)
- **Testing**: Jest for unit/integration tests
- **Docker**: Development via docker-compose (PostgreSQL on port 5434, Redis on port 6380)
## Governance
Constitution principles guide all feature development. Complexity beyond these patterns must be justified in the plan's Complexity Tracking table.
**Version**: 1.0.0 | **Ratified**: 2026-03-18 | **Last Amended**: 2026-03-18

4
apps/api/src/app/app.module.ts

@ -55,6 +55,8 @@ import { FamilyOfficeModule } from './family-office/family-office.module';
import { HealthModule } from './health/health.module';
import { ImportModule } from './import/import.module';
import { InfoModule } from './info/info.module';
import { CellMappingModule } from './cell-mapping/cell-mapping.module';
import { K1ImportModule } from './k1-import/k1-import.module';
import { KDocumentModule } from './k-document/k-document.module';
import { LogoModule } from './logo/logo.module';
import { PartnershipModule } from './partnership/partnership.module';
@ -129,6 +131,8 @@ import { UserModule } from './user/user.module';
HealthModule,
ImportModule,
InfoModule,
CellMappingModule,
K1ImportModule,
KDocumentModule,
LogoModule,
MarketDataModule,

145
apps/api/src/app/cell-mapping/cell-mapping.controller.ts

@ -0,0 +1,145 @@
import { HasPermission } from '@ghostfolio/api/decorators/has-permission.decorator';
import { HasPermissionGuard } from '@ghostfolio/api/guards/has-permission.guard';
import { permissions } from '@ghostfolio/common/permissions';
import {
Body,
Controller,
Delete,
Get,
Patch,
Put,
Query,
UseGuards
} from '@nestjs/common';
import { AuthGuard } from '@nestjs/passport';
import { CellMappingService } from './cell-mapping.service';
/** Request body accepted by PUT /api/v1/cell-mapping. */
type UpdateMappingsBody = {
  partnershipId: string;
  mappings: Array<{
    boxNumber: string;
    label: string;
    description?: string;
    cellType?: string;
    isCustom: boolean;
  }>;
};

/** Request body accepted by PATCH /api/v1/cell-mapping/toggle-ignored. */
type ToggleIgnoredBody = { partnershipId: string; boxNumber: string };

/** Request body accepted by PUT /api/v1/cell-mapping/aggregation-rules. */
type UpdateAggregationRulesBody = {
  partnershipId: string;
  rules: Array<{
    name: string;
    operation: string;
    sourceCells: string[];
  }>;
};

/**
 * HTTP endpoints for K-1 cell mappings and aggregation rules.
 *
 * Every route is guarded by JWT authentication plus a K-document permission
 * (read for GET routes, update for mutating routes) and simply forwards its
 * arguments to CellMappingService.
 */
@Controller('cell-mapping')
export class CellMappingController {
  public constructor(
    private readonly cellMappingService: CellMappingService
  ) {}

  /**
   * GET /api/v1/cell-mapping
   * Returns the cell mappings for a partnership (with global defaults).
   */
  @HasPermission(permissions.readKDocument)
  @Get()
  @UseGuards(AuthGuard('jwt'), HasPermissionGuard)
  public async getMappings(@Query('partnershipId') partnershipId?: string) {
    return this.cellMappingService.getMappings(partnershipId);
  }

  /**
   * PUT /api/v1/cell-mapping
   * Creates or updates the cell mappings of a partnership.
   */
  @HasPermission(permissions.updateKDocument)
  @Put()
  @UseGuards(AuthGuard('jwt'), HasPermissionGuard)
  public async updateMappings(@Body() data: UpdateMappingsBody) {
    const { partnershipId, mappings } = data;

    return this.cellMappingService.updateMappings(partnershipId, mappings);
  }

  /**
   * DELETE /api/v1/cell-mapping/reset
   * Resets a partnership's cell mappings to the IRS defaults.
   */
  @HasPermission(permissions.updateKDocument)
  @Delete('reset')
  @UseGuards(AuthGuard('jwt'), HasPermissionGuard)
  public async resetMappings(@Query('partnershipId') partnershipId: string) {
    return this.cellMappingService.resetMappings(partnershipId);
  }

  /**
   * PATCH /api/v1/cell-mapping/toggle-ignored
   * Flips the isIgnored flag of one cell mapping.
   */
  @HasPermission(permissions.updateKDocument)
  @Patch('toggle-ignored')
  @UseGuards(AuthGuard('jwt'), HasPermissionGuard)
  public async toggleIgnored(@Body() data: ToggleIgnoredBody) {
    const { partnershipId, boxNumber } = data;

    return this.cellMappingService.toggleIgnored(partnershipId, boxNumber);
  }

  /**
   * GET /api/v1/cell-mapping/aggregation-rules
   * Returns the aggregation rules for a partnership.
   */
  @HasPermission(permissions.readKDocument)
  @Get('aggregation-rules')
  @UseGuards(AuthGuard('jwt'), HasPermissionGuard)
  public async getAggregationRules(
    @Query('partnershipId') partnershipId?: string
  ) {
    return this.cellMappingService.getAggregationRules(partnershipId);
  }

  /**
   * PUT /api/v1/cell-mapping/aggregation-rules
   * Updates the aggregation rules of a partnership.
   */
  @HasPermission(permissions.updateKDocument)
  @Put('aggregation-rules')
  @UseGuards(AuthGuard('jwt'), HasPermissionGuard)
  public async updateAggregationRules(
    @Body() data: UpdateAggregationRulesBody
  ) {
    const { partnershipId, rules } = data;

    return this.cellMappingService.updateAggregationRules(partnershipId, rules);
  }

  /**
   * GET /api/v1/cell-mapping/aggregation-rules/compute
   * Computes the aggregation values for one KDocument (FR-036).
   */
  @HasPermission(permissions.readKDocument)
  @Get('aggregation-rules/compute')
  @UseGuards(AuthGuard('jwt'), HasPermissionGuard)
  public async computeAggregations(
    @Query('kDocumentId') kDocumentId: string,
    @Query('partnershipId') partnershipId?: string
  ) {
    return this.cellMappingService.computeAggregations(
      kDocumentId,
      partnershipId
    );
  }
}

14
apps/api/src/app/cell-mapping/cell-mapping.module.ts

@ -0,0 +1,14 @@
import { PrismaModule } from '@ghostfolio/api/services/prisma/prisma.module';
import { Module } from '@nestjs/common';
import { CellMappingController } from './cell-mapping.controller';
import { CellMappingService } from './cell-mapping.service';
/**
 * NestJS module for K-1 cell mappings.
 * Registers CellMappingController, imports PrismaModule, and both provides
 * and exports CellMappingService so other modules can inject it.
 */
@Module({
controllers: [CellMappingController],
exports: [CellMappingService],
imports: [PrismaModule],
providers: [CellMappingService]
})
export class CellMappingModule {}

467
apps/api/src/app/cell-mapping/cell-mapping.service.ts

@ -0,0 +1,467 @@
import { PrismaService } from '@ghostfolio/api/services/prisma/prisma.service';
import { HttpException, Injectable, OnModuleInit } from '@nestjs/common';
import { StatusCodes } from 'http-status-codes';
/**
 * Allowed cell types.
 * Every entry of the default mapping table declares exactly one of these
 * literals as its `cellType`.
 */
type CellType = 'number' | 'string' | 'percentage' | 'boolean';
/**
 * Default IRS K-1 (Form 1065) cell mappings.
 *
 * One entry per form cell: `boxNumber` is the lookup key, `label` and
 * `description` are display text, `cellType` is the value kind and
 * `sortOrder` is the ordering key (mappings are queried and merged by
 * ascending `sortOrder`). The table is grouped by form section, each group
 * using its own `sortOrder` range.
 */
const IRS_DEFAULT_MAPPINGS: Array<{
boxNumber: string;
label: string;
description: string;
cellType: CellType;
sortOrder: number;
}> = [
// ── Header / Metadata ──────────────────────────────────────────────────
{ boxNumber: 'K1_DOCUMENT_ID', label: 'K-1 Document ID', description: 'Large-font ID at top right of K-1 form', cellType: 'string', sortOrder: 0 },
{ boxNumber: 'TAX_YEAR', label: 'Tax Year', description: 'Calendar year or tax year beginning/ending', cellType: 'string', sortOrder: 1 },
{ boxNumber: 'FINAL_K1', label: 'Final K-1', description: 'Check if this is a final K-1', cellType: 'boolean', sortOrder: 2 },
{ boxNumber: 'AMENDED_K1', label: 'Amended K-1', description: 'Check if this is an amended K-1', cellType: 'boolean', sortOrder: 3 },
// ── Part I — Information About the Partnership ─────────────────────────
{ boxNumber: 'A', label: "Partnership's EIN", description: 'Part I, Line A — Employer identification number', cellType: 'string', sortOrder: 10 },
{ boxNumber: 'B', label: "Partnership's name, address, city, state, ZIP", description: 'Part I, Line B', cellType: 'string', sortOrder: 11 },
{ boxNumber: 'C', label: 'IRS center where partnership filed return', description: 'Part I, Line C', cellType: 'string', sortOrder: 12 },
{ boxNumber: 'D', label: 'Publicly traded partnership (PTP)', description: 'Part I, Line D — Check if PTP', cellType: 'boolean', sortOrder: 13 },
// ── Part II — Information About the Partner ────────────────────────────
{ boxNumber: 'E', label: "Partner's identifying number", description: 'Part II, Line E — SSN or TIN', cellType: 'string', sortOrder: 20 },
{ boxNumber: 'F', label: "Partner's name, address, city, state, ZIP", description: 'Part II, Line F', cellType: 'string', sortOrder: 21 },
{ boxNumber: 'G_GENERAL', label: 'General partner or LLC member-manager', description: 'Part II, Line G — General partner checkbox', cellType: 'boolean', sortOrder: 22 },
{ boxNumber: 'G_LIMITED', label: 'Limited partner or other LLC member', description: 'Part II, Line G — Limited partner checkbox', cellType: 'boolean', sortOrder: 23 },
{ boxNumber: 'H1_DOMESTIC', label: 'Domestic partner', description: 'Part II, Line H1 — Domestic', cellType: 'boolean', sortOrder: 24 },
{ boxNumber: 'H1_FOREIGN', label: 'Foreign partner', description: 'Part II, Line H1 — Foreign', cellType: 'boolean', sortOrder: 25 },
{ boxNumber: 'H2', label: 'Disregarded entity (DE)', description: 'Part II, Line H2 — DE checkbox', cellType: 'boolean', sortOrder: 26 },
{ boxNumber: 'H2_TIN', label: 'Disregarded entity TIN', description: 'Part II, Line H2 — DE taxpayer ID', cellType: 'string', sortOrder: 27 },
{ boxNumber: 'I1', label: 'Type of entity', description: 'Part II, Line I1 — Entity type of partner', cellType: 'string', sortOrder: 28 },
{ boxNumber: 'I2', label: 'Retirement plan (IRA/SEP/Keogh)', description: 'Part II, Line I2 — Retirement plan checkbox', cellType: 'boolean', sortOrder: 29 },
// ── Section J — Partner's Share of Profit, Loss, and Capital ───────────
{ boxNumber: 'J_PROFIT_BEGIN', label: 'Profit — Beginning %', description: 'Section J — Profit share beginning of year', cellType: 'percentage', sortOrder: 30 },
{ boxNumber: 'J_PROFIT_END', label: 'Profit — Ending %', description: 'Section J — Profit share end of year', cellType: 'percentage', sortOrder: 31 },
{ boxNumber: 'J_LOSS_BEGIN', label: 'Loss — Beginning %', description: 'Section J — Loss share beginning of year', cellType: 'percentage', sortOrder: 32 },
{ boxNumber: 'J_LOSS_END', label: 'Loss — Ending %', description: 'Section J — Loss share end of year', cellType: 'percentage', sortOrder: 33 },
{ boxNumber: 'J_CAPITAL_BEGIN', label: 'Capital — Beginning %', description: 'Section J — Capital share beginning of year', cellType: 'percentage', sortOrder: 34 },
{ boxNumber: 'J_CAPITAL_END', label: 'Capital — Ending %', description: 'Section J — Capital share end of year', cellType: 'percentage', sortOrder: 35 },
{ boxNumber: 'J_SALE', label: 'Decrease due to sale', description: 'Section J — Check if decrease is due to sale', cellType: 'boolean', sortOrder: 36 },
{ boxNumber: 'J_EXCHANGE', label: 'Exchange of partnership interest', description: 'Section J — Check if exchange', cellType: 'boolean', sortOrder: 37 },
// ── Section K — Partner's Share of Liabilities ─────────────────────────
{ boxNumber: 'K_NONRECOURSE_BEGIN', label: 'Nonrecourse — Beginning', description: 'Section K — Nonrecourse liabilities beginning', cellType: 'number', sortOrder: 40 },
{ boxNumber: 'K_NONRECOURSE_END', label: 'Nonrecourse — Ending', description: 'Section K — Nonrecourse liabilities ending', cellType: 'number', sortOrder: 41 },
{ boxNumber: 'K_QUAL_NONRECOURSE_BEGIN', label: 'Qualified nonrecourse — Beginning', description: 'Section K — Qualified nonrecourse financing beginning', cellType: 'number', sortOrder: 42 },
{ boxNumber: 'K_QUAL_NONRECOURSE_END', label: 'Qualified nonrecourse — Ending', description: 'Section K — Qualified nonrecourse financing ending', cellType: 'number', sortOrder: 43 },
{ boxNumber: 'K_RECOURSE_BEGIN', label: 'Recourse — Beginning', description: 'Section K — Recourse liabilities beginning', cellType: 'number', sortOrder: 44 },
{ boxNumber: 'K_RECOURSE_END', label: 'Recourse — Ending', description: 'Section K — Recourse liabilities ending', cellType: 'number', sortOrder: 45 },
{ boxNumber: 'K2', label: 'Includes lower-tier partnership liabilities', description: 'Section K2 — Checkbox', cellType: 'boolean', sortOrder: 46 },
{ boxNumber: 'K3', label: 'Liability subject to guarantees', description: 'Section K3 — Checkbox', cellType: 'boolean', sortOrder: 47 },
// ── Section L — Partner's Capital Account Analysis ─────────────────────
{ boxNumber: 'L_BEG_CAPITAL', label: 'Beginning capital account', description: 'Section L — Beginning capital', cellType: 'number', sortOrder: 50 },
{ boxNumber: 'L_CONTRIBUTED', label: 'Capital contributed during year', description: 'Section L — Capital contributed', cellType: 'number', sortOrder: 51 },
{ boxNumber: 'L_CURR_YR_INCOME', label: 'Current year net income (loss)', description: 'Section L — Current year income/loss', cellType: 'number', sortOrder: 52 },
{ boxNumber: 'L_OTHER', label: 'Other increase (decrease)', description: 'Section L — Other adjustments', cellType: 'number', sortOrder: 53 },
{ boxNumber: 'L_WITHDRAWALS', label: 'Withdrawals and distributions', description: 'Section L — Withdrawals/distributions', cellType: 'number', sortOrder: 54 },
{ boxNumber: 'L_END_CAPITAL', label: 'Ending capital account', description: 'Section L — Ending capital', cellType: 'number', sortOrder: 55 },
// ── Section M — Contributed Property ───────────────────────────────────
{ boxNumber: 'M_YES', label: 'Contributed property with built-in gain/loss — Yes', description: 'Section M — Yes checkbox', cellType: 'boolean', sortOrder: 60 },
{ boxNumber: 'M_NO', label: 'Contributed property with built-in gain/loss — No', description: 'Section M — No checkbox', cellType: 'boolean', sortOrder: 61 },
// ── Section N — Net Unrecognized Section 704(c) ────────────────────────
{ boxNumber: 'N_BEGINNING', label: 'Net 704(c) gain/loss — Beginning', description: 'Section N — Beginning balance', cellType: 'number', sortOrder: 62 },
{ boxNumber: 'N_ENDING', label: 'Net 704(c) gain/loss — Ending', description: 'Section N — Ending balance', cellType: 'number', sortOrder: 63 },
// ── Part III — Partner's Share of Current Year Income, Deductions, etc. ─
{ boxNumber: '1', label: 'Ordinary business income (loss)', description: 'IRS Schedule K-1 Box 1', cellType: 'number', sortOrder: 100 },
{ boxNumber: '2', label: 'Net rental real estate income (loss)', description: 'IRS Schedule K-1 Box 2', cellType: 'number', sortOrder: 101 },
{ boxNumber: '3', label: 'Other net rental income (loss)', description: 'IRS Schedule K-1 Box 3', cellType: 'number', sortOrder: 102 },
// NOTE(review): the current IRS Schedule K-1 (Form 1065) appears to number the
// guaranteed-payment boxes 4a (services), 4b (capital) and 4c (total), whereas
// the keys below are '4', '4a' and '4b'. Confirm against the form and against
// any extractor that emits these box numbers before renaming anything.
{ boxNumber: '4', label: 'Guaranteed payments for services', description: 'IRS Schedule K-1 Box 4', cellType: 'number', sortOrder: 103 },
{ boxNumber: '4a', label: 'Guaranteed payments for capital', description: 'IRS Schedule K-1 Box 4a', cellType: 'number', sortOrder: 104 },
{ boxNumber: '4b', label: 'Total guaranteed payments', description: 'IRS Schedule K-1 Box 4b', cellType: 'number', sortOrder: 105 },
{ boxNumber: '5', label: 'Interest income', description: 'IRS Schedule K-1 Box 5', cellType: 'number', sortOrder: 106 },
{ boxNumber: '6a', label: 'Ordinary dividends', description: 'IRS Schedule K-1 Box 6a', cellType: 'number', sortOrder: 107 },
{ boxNumber: '6b', label: 'Qualified dividends', description: 'IRS Schedule K-1 Box 6b', cellType: 'number', sortOrder: 108 },
{ boxNumber: '6c', label: 'Dividend equivalents', description: 'IRS Schedule K-1 Box 6c', cellType: 'number', sortOrder: 109 },
{ boxNumber: '7', label: 'Royalties', description: 'IRS Schedule K-1 Box 7', cellType: 'number', sortOrder: 110 },
{ boxNumber: '8', label: 'Net short-term capital gain (loss)', description: 'IRS Schedule K-1 Box 8', cellType: 'number', sortOrder: 111 },
{ boxNumber: '9a', label: 'Net long-term capital gain (loss)', description: 'IRS Schedule K-1 Box 9a', cellType: 'number', sortOrder: 112 },
{ boxNumber: '9b', label: 'Collectibles (28%) gain (loss)', description: 'IRS Schedule K-1 Box 9b', cellType: 'number', sortOrder: 113 },
{ boxNumber: '9c', label: 'Unrecaptured section 1250 gain', description: 'IRS Schedule K-1 Box 9c', cellType: 'number', sortOrder: 114 },
{ boxNumber: '10', label: 'Net section 1231 gain (loss)', description: 'IRS Schedule K-1 Box 10', cellType: 'number', sortOrder: 115 },
{ boxNumber: '11', label: 'Other income (loss)', description: 'IRS Schedule K-1 Box 11', cellType: 'number', sortOrder: 116 },
{ boxNumber: '12', label: 'Section 179 deduction', description: 'IRS Schedule K-1 Box 12', cellType: 'number', sortOrder: 117 },
{ boxNumber: '13', label: 'Other deductions', description: 'IRS Schedule K-1 Box 13', cellType: 'number', sortOrder: 118 },
{ boxNumber: '14', label: 'Self-employment earnings (loss)', description: 'IRS Schedule K-1 Box 14', cellType: 'number', sortOrder: 119 },
{ boxNumber: '15', label: 'Credits', description: 'IRS Schedule K-1 Box 15', cellType: 'number', sortOrder: 120 },
{ boxNumber: '16', label: 'Foreign transactions', description: 'IRS Schedule K-1 Box 16', cellType: 'number', sortOrder: 121 },
{ boxNumber: '16_K3', label: 'Schedule K-3 is attached', description: 'IRS Schedule K-1 Box 16 K-3 checkbox', cellType: 'boolean', sortOrder: 122 },
{ boxNumber: '17', label: 'Alternative minimum tax (AMT) items', description: 'IRS Schedule K-1 Box 17', cellType: 'number', sortOrder: 123 },
{ boxNumber: '18', label: 'Tax-exempt income and nondeductible expenses', description: 'IRS Schedule K-1 Box 18', cellType: 'number', sortOrder: 124 },
{ boxNumber: '19', label: 'Distributions', description: 'IRS Schedule K-1 Box 19', cellType: 'number', sortOrder: 125 },
{ boxNumber: '19a', label: 'Distributions — Cash and marketable securities', description: 'IRS Schedule K-1 Box 19a', cellType: 'number', sortOrder: 126 },
{ boxNumber: '19b', label: 'Distributions — Other property', description: 'IRS Schedule K-1 Box 19b', cellType: 'number', sortOrder: 127 },
{ boxNumber: '20A', label: 'Other information — Code A', description: 'IRS Schedule K-1 Box 20, Code A', cellType: 'number', sortOrder: 128 },
{ boxNumber: '20B', label: 'Other information — Code B', description: 'IRS Schedule K-1 Box 20, Code B', cellType: 'number', sortOrder: 129 },
{ boxNumber: '20V', label: 'Other information — Code V', description: 'IRS Schedule K-1 Box 20, Code V', cellType: 'number', sortOrder: 130 },
{ boxNumber: '20_WILDCARD', label: 'Other information — Other codes', description: 'IRS Schedule K-1 Box 20, all other codes', cellType: 'number', sortOrder: 131 },
{ boxNumber: '21', label: 'Foreign taxes paid or accrued', description: 'IRS Schedule K-1 Box 21', cellType: 'number', sortOrder: 132 },
{ boxNumber: '22', label: 'More than one activity for at-risk purposes', description: 'IRS Schedule K-1 Box 22 — Checkbox', cellType: 'boolean', sortOrder: 133 },
{ boxNumber: '23', label: 'More than one activity for passive activity purposes', description: 'IRS Schedule K-1 Box 23 — Checkbox', cellType: 'boolean', sortOrder: 134 }
];
/**
 * Default aggregation rules.
 * Each rule has a display name, an operation, the box numbers it reads
 * (`sourceCells`) and a `sortOrder` used for ordering.
 */
const DEFAULT_AGGREGATION_RULES: {
  name: string;
  operation: string;
  sourceCells: string[];
  sortOrder: number;
}[] = [
  { name: 'Total Ordinary Income', operation: 'SUM', sourceCells: ['1'], sortOrder: 1 },
  {
    name: 'Total Capital Gains',
    operation: 'SUM',
    sourceCells: ['8', '9a', '9b', '9c', '10'],
    sortOrder: 2
  },
  { name: 'Total Deductions', operation: 'SUM', sourceCells: ['12', '13'], sortOrder: 3 }
];
@Injectable()
export class CellMappingService implements OnModuleInit {
/** Receives the PrismaService that this service's methods use for their database queries. */
public constructor(private readonly prismaService: PrismaService) {}
/**
 * OnModuleInit lifecycle hook.
 * Seeds the default cell mappings first and, once that has finished,
 * the default aggregation rules.
 */
public async onModuleInit() {
await this.seedDefaultMappings();
await this.seedDefaultAggregationRules();
}
/**
 * Bring the global default mappings (partnershipId = null) in line with
 * IRS_DEFAULT_MAPPINGS. Runs three steps, in this order:
 *   1. insert every default whose box number is not stored yet;
 *   2. backfill `cellType` on stored rows that still say 'number' although
 *      the default declares another type (rows seeded before the cellType
 *      column existed);
 *   3. delete stored, non-custom rows whose box number is no longer a
 *      default (for example the parent-level box "20" that was replaced by
 *      20A/20B/20V/20_WILDCARD).
 */
public async seedDefaultMappings() {
  const storedDefaults = await this.prismaService.cellMapping.findMany({
    where: { partnershipId: null }
  });

  // Index the stored rows by box number, keeping the first row per box.
  const storedByBox = new Map<string, (typeof storedDefaults)[number]>();
  for (const row of storedDefaults) {
    if (!storedByBox.has(row.boxNumber)) {
      storedByBox.set(row.boxNumber, row);
    }
  }

  // Step 1: insert defaults that are missing from the database.
  const missingDefaults = IRS_DEFAULT_MAPPINGS.filter(
    ({ boxNumber }) => !storedByBox.has(boxNumber)
  );
  if (missingDefaults.length > 0) {
    await this.prismaService.cellMapping.createMany({
      data: missingDefaults.map((seed) => ({
        ...seed,
        partnershipId: null,
        isCustom: false,
        isIgnored: false
      }))
    });
  }

  // Step 2: backfill cellType, one update per affected row.
  for (const seed of IRS_DEFAULT_MAPPINGS) {
    if (seed.cellType === 'number') {
      continue;
    }
    const storedRow = storedByBox.get(seed.boxNumber);
    if (!storedRow || (storedRow as any).cellType !== 'number') {
      continue;
    }
    await this.prismaService.cellMapping.update({
      where: { id: storedRow.id },
      data: { cellType: seed.cellType }
    });
  }

  // Step 3: remove non-custom rows that are no longer part of the defaults.
  const currentBoxNumbers = new Set(
    IRS_DEFAULT_MAPPINGS.map(({ boxNumber }) => boxNumber)
  );
  const staleIds = storedDefaults
    .filter((row) => !row.isCustom && !currentBoxNumbers.has(row.boxNumber))
    .map((row) => row.id);
  if (staleIds.length > 0) {
    await this.prismaService.cellMapping.deleteMany({
      where: { id: { in: staleIds } }
    });
  }
}
/**
 * Insert DEFAULT_AGGREGATION_RULES as global rules (partnershipId = null),
 * but only when no global rule is stored yet.
 */
public async seedDefaultAggregationRules() {
  const globalRuleCount = await this.prismaService.cellAggregationRule.count({
    where: { partnershipId: null }
  });
  const alreadySeeded = globalRuleCount > 0;

  if (!alreadySeeded) {
    const seedRows = DEFAULT_AGGREGATION_RULES.map((rule) => ({
      ...rule,
      partnershipId: null
    }));

    await this.prismaService.cellAggregationRule.createMany({ data: seedRows });
  }
}
/**
 * Return the cell mappings that apply to a partnership.
 *
 * Without a partnershipId only the global defaults are returned. With one,
 * the partnership's own mappings are combined with the global defaults for
 * every box number the partnership has not overridden, sorted by sortOrder.
 */
public async getMappings(partnershipId?: string) {
  const loadMappings = (owner: string | null) =>
    this.prismaService.cellMapping.findMany({
      where: { partnershipId: owner },
      orderBy: { sortOrder: 'asc' }
    });

  if (!partnershipId) {
    return loadMappings(null);
  }

  // Partnership overrides first, then the global defaults.
  const overrides = await loadMappings(partnershipId);
  const defaults = await loadMappings(null);

  const overriddenBoxes = new Set(overrides.map(({ boxNumber }) => boxNumber));
  const combined = overrides.concat(
    defaults.filter(({ boxNumber }) => !overriddenBoxes.has(boxNumber))
  );

  combined.sort((left, right) => left.sortOrder - right.sortOrder);
  return combined;
}
/**
 * Return the aggregation rules that apply to a partnership.
 *
 * A partnership's own rules win as a whole: if it has at least one rule,
 * only those are returned. Otherwise (or when no partnershipId is given)
 * the global default rules are returned.
 */
public async getAggregationRules(partnershipId?: string) {
  const loadRules = (owner: string | null) =>
    this.prismaService.cellAggregationRule.findMany({
      where: { partnershipId: owner },
      orderBy: { sortOrder: 'asc' }
    });

  if (partnershipId) {
    const ownRules = await loadRules(partnershipId);
    if (ownRules.length > 0) {
      return ownRules;
    }
  }

  // No partnership given, or it has no rules of its own: use the defaults.
  return loadRules(null);
}
/**
 * Upsert partnership-specific cell mappings, one row per entry, keyed by
 * (partnershipId, boxNumber). Global defaults are never touched.
 *
 * The position of an entry in `mappings` becomes its 1-based sortOrder.
 * On update, cellType is only overwritten when the entry supplies one; on
 * create it falls back to 'number'. An empty description is stored as null.
 *
 * @returns the upserted rows, in input order
 */
public async updateMappings(
  partnershipId: string,
  mappings: Array<{
    boxNumber: string;
    label: string;
    description?: string;
    cellType?: string;
    isCustom: boolean;
  }>
) {
  const saved = [];

  for (const [index, mapping] of mappings.entries()) {
    const { boxNumber, label, cellType, isCustom } = mapping;
    const description = mapping.description || null;
    const sortOrder = index + 1;

    const row = await this.prismaService.cellMapping.upsert({
      where: {
        partnershipId_boxNumber: { partnershipId, boxNumber }
      },
      update: {
        label,
        description,
        isCustom,
        sortOrder,
        // Leave the stored cellType alone unless a new one was provided
        ...(cellType ? { cellType } : {})
      },
      create: {
        partnershipId,
        boxNumber,
        label,
        description,
        cellType: cellType || 'number',
        isCustom,
        sortOrder
      }
    });

    saved.push(row);
  }

  return saved;
}
/**
 * Drop every partnership-specific mapping override so that only the global
 * defaults remain in effect for this partnership.
 *
 * @returns a confirmation object echoing the partnershipId
 */
public async resetMappings(partnershipId: string) {
  const ownedByPartnership = { partnershipId };

  await this.prismaService.cellMapping.deleteMany({
    where: ownedByPartnership
  });

  return { deleted: true, partnershipId };
}
/**
 * Toggle the isIgnored flag of a box for one partnership.
 *
 * - If the partnership already has its own mapping for the box, that row's
 *   flag is inverted.
 * - Otherwise the global default is copied into a new partnership-specific
 *   row whose flag is the inverse of the default's flag.
 *
 * @param partnershipId - partnership whose mapping is toggled
 * @param boxNumber - box identifier of the mapping
 * @returns the updated or newly created mapping row
 * @throws HttpException (404) when neither an override nor a global default
 *   exists for the box
 */
public async toggleIgnored(
  partnershipId: string,
  boxNumber: string
) {
  // Check for partnership-specific mapping first
  const existing = await this.prismaService.cellMapping.findUnique({
    where: { partnershipId_boxNumber: { partnershipId, boxNumber } }
  });

  if (existing) {
    return this.prismaService.cellMapping.update({
      where: { id: existing.id },
      data: { isIgnored: !existing.isIgnored }
    });
  }

  // No partnership override — check for global default and create an override
  const globalMapping = await this.prismaService.cellMapping.findFirst({
    where: { partnershipId: null, boxNumber }
  });

  if (globalMapping) {
    return this.prismaService.cellMapping.create({
      data: {
        partnershipId,
        boxNumber: globalMapping.boxNumber,
        label: globalMapping.label,
        description: globalMapping.description,
        cellType: globalMapping.cellType,
        isCustom: false,
        // Invert the default's flag instead of forcing `true`; otherwise a
        // default that is already ignored could not be un-ignored in one step.
        isIgnored: !globalMapping.isIgnored,
        sortOrder: globalMapping.sortOrder
      }
    });
  }

  throw new HttpException(
    `No cell mapping found for box ${boxNumber}`,
    StatusCodes.NOT_FOUND
  );
}
/**
 * Replace all aggregation rules of a partnership with the given list.
 *
 * The existing rules are deleted and the new ones inserted inside a single
 * transaction, so a failed insert can no longer leave the partnership with
 * its previous rules wiped. The position of a rule in `rules` becomes its
 * 1-based sortOrder.
 *
 * NOTE(review): assumes `prismaService` exposes Prisma Client's
 * `$transaction` (it already exposes the model delegates directly) — confirm.
 *
 * @param partnershipId - partnership whose rules are replaced
 * @param rules - the complete new rule set
 * @returns the rules now in effect, as resolved by getAggregationRules
 *   (which falls back to the global defaults when `rules` is empty)
 */
public async updateAggregationRules(
  partnershipId: string,
  rules: Array<{
    name: string;
    operation: string;
    sourceCells: string[];
  }>
) {
  // Delete existing partnership rules and recreate them atomically
  await this.prismaService.$transaction([
    this.prismaService.cellAggregationRule.deleteMany({
      where: { partnershipId }
    }),
    this.prismaService.cellAggregationRule.createMany({
      data: rules.map((rule, i) => ({
        partnershipId,
        name: rule.name,
        operation: rule.operation,
        sourceCells: rule.sourceCells,
        sortOrder: i + 1
      }))
    })
  ]);

  return this.getAggregationRules(partnershipId);
}
/**
 * Evaluate every applicable aggregation rule against one KDocument (FR-036).
 *
 * Rules are resolved for the explicit partnershipId, or for the document's
 * own partnership when none is given. For each rule a per-box breakdown is
 * built from the document's `data`; a box whose stored value is not a number
 * contributes 0. Only the 'SUM' operation produces a total — any other
 * operation reports a value of 0.
 *
 * @throws HttpException (404) when the KDocument does not exist
 */
public async computeAggregations(
  kDocumentId: string,
  partnershipId?: string
) {
  const kDocument = await this.prismaService.kDocument.findUnique({
    where: { id: kDocumentId }
  });

  if (!kDocument) {
    throw new HttpException('KDocument not found', StatusCodes.NOT_FOUND);
  }

  const rules = await this.getAggregationRules(
    partnershipId || kDocument.partnershipId
  );
  const boxValues = kDocument.data as Record<string, any>;

  return rules.map((rule: any) => {
    const sourceCells = (rule.sourceCells || []) as string[];

    const breakdown = sourceCells.map((boxNumber: string) => {
      const stored = boxValues[boxNumber];
      return { boxNumber, value: typeof stored === 'number' ? stored : 0 };
    });

    let value = 0;
    if (rule.operation === 'SUM') {
      for (const entry of breakdown) {
        value = value + entry.value;
      }
    }

    return { name: rule.name, operation: rule.operation, value, breakdown };
  });
}
}

15
apps/api/src/app/k1-import/dto/confirm-k1.dto.ts

@ -0,0 +1,15 @@
import { KDocumentStatus } from '@prisma/client';
import { IsEnum, IsOptional, IsString } from 'class-validator';
/**
 * Values accepted for `existingKDocumentAction`. Passed to `@IsEnum`, which
 * checks the incoming value against the values of the given object.
 */
const EXISTING_K_DOCUMENT_ACTIONS = {
  UPDATE: 'UPDATE',
  CREATE_NEW: 'CREATE_NEW'
} as const;

/**
 * DTO for confirming a verified K-1 import session.
 * Triggers auto-creation of KDocument, Distributions, and Document linkage.
 */
export class ConfirmK1Dto {
  /** Filing status to record; must be a member of KDocumentStatus. */
  @IsEnum(KDocumentStatus)
  filingStatus: KDocumentStatus;

  /**
   * Optional action selector for an already existing KDocument. When present
   * it must be exactly 'UPDATE' or 'CREATE_NEW'; previously any string
   * passed validation.
   */
  @IsOptional()
  @IsString()
  @IsEnum(EXISTING_K_DOCUMENT_ACTIONS)
  existingKDocumentAction?: 'UPDATE' | 'CREATE_NEW';
}

10
apps/api/src/app/k1-import/dto/upload-k1.dto.ts

@ -0,0 +1,10 @@
import { IsInt, IsString, Min } from 'class-validator';
/**
 * Validated request fields accompanying a K-1 upload: the partnership the
 * document belongs to and the tax year it covers.
 */
export class UploadK1Dto {
// Partnership identifier; validated only as being a string.
@IsString()
partnershipId: string;
// Tax year; must be an integer and at least 1900 (no upper bound is enforced).
@IsInt()
@Min(1900)
taxYear: number;
}

28
apps/api/src/app/k1-import/dto/verify-k1.dto.ts

@ -0,0 +1,28 @@
import { IsArray, IsInt, IsOptional, Min, ValidateNested } from 'class-validator';
import { Type } from 'class-transformer';
import {
K1ExtractedFieldDto,
K1UnmappedItemDto
} from '@ghostfolio/common/dtos';
/**
 * DTO for verifying a K-1 import session, used for route-level validation.
 *
 * NOTE(review): an earlier version of this comment said the class re-exports
 * the shared VerifyK1ImportDto; the class is in fact declared here on top of
 * the shared K1ExtractedFieldDto / K1UnmappedItemDto — confirm the two
 * definitions are meant to stay in sync.
 */
export class VerifyK1Dto {
// Tax year; must be an integer and at least 1900.
@IsInt()
@Min(1900)
taxYear: number;
// Extracted fields; each element is transformed to K1ExtractedFieldDto and
// validated recursively.
@IsArray()
@ValidateNested({ each: true })
@Type(() => K1ExtractedFieldDto)
fields: K1ExtractedFieldDto[];
// Optional unmapped items; when present, each element is transformed to
// K1UnmappedItemDto and validated recursively.
@IsOptional()
@IsArray()
@ValidateNested({ each: true })
@Type(() => K1UnmappedItemDto)
unmappedItems?: K1UnmappedItemDto[];
}

302
apps/api/src/app/k1-import/extractors/azure-extractor.ts

@ -0,0 +1,302 @@
import { ConfigurationService } from '@ghostfolio/api/services/configuration/configuration.service';
import type { K1ExtractionResult, K1ExtractedField } from '@ghostfolio/common/interfaces';
import { Injectable, Logger } from '@nestjs/common';
import type { K1Extractor } from './k1-extractor.interface';
/** Box identifiers a key may resolve to by number (compared in lower case). */
const AZURE_K1_BOX_NUMBERS: ReadonlySet<string> = new Set([
  '1', '2', '3', '4a', '4b', '4', '5', '6a', '6b', '6c', '7', '8',
  '9a', '9b', '9c', '10', '11', '12', '13', '14', '15', '16', '17', '18',
  '19a', '19b', '20', '21'
]);

/**
 * Captures the first word of a key after an optional "box" prefix
 * (e.g. "1", "6a", "Box 19a"). `\w+` is greedy, so the captured token always
 * ends at a word boundary and "1" never matches inside "10" or "1a".
 */
const AZURE_BOX_KEY_PATTERN = /^(?:box\s*)?(\w+)/i;

/** Label keywords mapped to box numbers; evaluated in order, first hit wins. */
const AZURE_LABEL_PATTERNS: ReadonlyArray<{ pattern: RegExp; box: string }> = [
  { pattern: /ordinary\s+business\s+income/i, box: '1' },
  { pattern: /net\s+rental\s+real\s+estate/i, box: '2' },
  { pattern: /other\s+net\s+rental/i, box: '3' },
  { pattern: /guaranteed\s+payments?\s+for\s+services/i, box: '4' },
  { pattern: /guaranteed\s+payments?\s+for\s+capital/i, box: '4a' },
  { pattern: /total\s+guaranteed\s+payments/i, box: '4b' },
  { pattern: /interest\s+income/i, box: '5' },
  { pattern: /ordinary\s+dividends/i, box: '6a' },
  { pattern: /qualified\s+dividends/i, box: '6b' },
  { pattern: /dividend\s+equivalents/i, box: '6c' },
  { pattern: /royalties/i, box: '7' },
  { pattern: /net\s+short[- ]term\s+capital/i, box: '8' },
  { pattern: /net\s+long[- ]term\s+capital/i, box: '9a' },
  { pattern: /collectibles.*28%/i, box: '9b' },
  { pattern: /unrecaptured\s+section\s*1250/i, box: '9c' },
  { pattern: /net\s+section\s*1231/i, box: '10' },
  { pattern: /section\s+179\s+deduction/i, box: '12' },
  { pattern: /self[- ]employment\s+earnings/i, box: '14' },
  { pattern: /foreign\s+taxes\s+paid/i, box: '21' }
];

/**
 * Tier 2 extractor using Azure AI Document Intelligence (Layout model).
 * Primary cloud OCR for scanned K-1 PDFs.
 * Requires AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and KEY to be configured.
 */
@Injectable()
export class AzureExtractor implements K1Extractor {
  private readonly logger = new Logger(AzureExtractor.name);

  public constructor(
    private readonly configurationService: ConfigurationService
  ) {}

  /** True when both the endpoint and the key are configured. */
  public isAvailable(): boolean {
    const endpoint = this.configurationService.get(
      'AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT'
    );
    const key = this.configurationService.get(
      'AZURE_DOCUMENT_INTELLIGENCE_KEY'
    );

    return !!(endpoint && key);
  }

  /**
   * Analyze the PDF with the 'prebuilt-layout' model and map the returned
   * key-value pairs and table rows to K-1 box numbers.
   *
   * @param buffer - PDF content
   * @param fileName - original file name (used for logging only)
   * @returns extracted fields; `overallConfidence` is the mean field
   *   confidence rounded to two decimals (0 when no field was found)
   * @throws Error when the Azure credentials are not configured; errors from
   *   the SDK propagate unchanged
   */
  public async extract(
    buffer: Buffer,
    fileName: string
  ): Promise<K1ExtractionResult> {
    this.logger.log(`Extracting from scanned PDF via Azure DI: ${fileName}`);

    const endpoint = this.configurationService.get(
      'AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT'
    );
    const key = this.configurationService.get(
      'AZURE_DOCUMENT_INTELLIGENCE_KEY'
    );

    if (!endpoint || !key) {
      throw new Error(
        'Azure Document Intelligence credentials not configured'
      );
    }

    // Dynamic import to avoid loading SDK when not configured
    const { AzureKeyCredential, DocumentAnalysisClient } = await import(
      '@azure/ai-form-recognizer'
    );

    const client = new DocumentAnalysisClient(
      endpoint,
      new AzureKeyCredential(key)
    );

    // Use prebuilt-layout model for general document analysis.
    // NOTE(review): depending on the service API version, key-value pairs may
    // only be returned by 'prebuilt-document' (or as an add-on feature) —
    // confirm that `keyValuePairs` is actually populated for this model.
    const poller = await client.beginAnalyzeDocument(
      'prebuilt-layout',
      buffer
    );
    const result = await poller.pollUntilDone();

    const fields: K1ExtractedField[] = [];
    const pageCount = result.pages?.length ?? 0;

    // Extract key-value pairs from the document
    for (const kvPair of result.keyValuePairs ?? []) {
      const keyContent = kvPair.key?.content?.trim();
      const valueContent = kvPair.value?.content?.trim();
      const confidence = kvPair.confidence || 0;

      if (!keyContent || !valueContent) continue;

      // Try to match key to a K-1 box number
      const boxNumber = this.matchKeyToBoxNumber(keyContent);
      if (!boxNumber) continue;

      fields.push({
        boxNumber,
        label: '', // Will be filled by field mapper
        customLabel: null,
        rawValue: valueContent,
        numericValue: this.parseNumericValue(valueContent),
        confidence: Math.round(confidence * 100) / 100,
        confidenceLevel: this.toConfidenceLevel(confidence),
        isUserEdited: false,
        isReviewed: false
      });
    }

    // Extract tables (K-1 forms often use tabular layout)
    for (const table of result.tables ?? []) {
      this.extractFieldsFromTable(table, fields);
    }

    // Extract metadata from the full text
    const fullText = result.content || '';
    const metadata = this.extractMetadata(fullText);

    const totalConfidence = fields.reduce((sum, f) => sum + f.confidence, 0);
    const overallConfidence =
      fields.length > 0 ? totalConfidence / fields.length : 0;

    return {
      metadata,
      fields,
      unmappedItems: [],
      overallConfidence: Math.round(overallConfidence * 100) / 100,
      method: 'azure',
      pagesProcessed: pageCount
    };
  }

  /** Bucket a raw 0..1 confidence: >= 0.85 HIGH, >= 0.6 MEDIUM, else LOW. */
  private toConfidenceLevel(confidence: number): 'HIGH' | 'MEDIUM' | 'LOW' {
    if (confidence >= 0.85) return 'HIGH';
    if (confidence >= 0.6) return 'MEDIUM';
    return 'LOW';
  }

  /**
   * Resolve a key/label string to a K-1 box number.
   *
   * First tries a leading box token such as "1", "6a", "19a" or "Box 1"
   * (case-insensitive); if that fails, falls back to label keywords.
   *
   * @returns the box number, or null when nothing matches
   */
  private matchKeyToBoxNumber(key: string): string | null {
    const token = AZURE_BOX_KEY_PATTERN.exec(key)?.[1]?.toLowerCase();
    if (token !== undefined && AZURE_K1_BOX_NUMBERS.has(token)) {
      return token;
    }

    // Also match by label keywords
    for (const { pattern, box } of AZURE_LABEL_PATTERNS) {
      if (pattern.test(key)) return box;
    }

    return null;
  }

  /**
   * Add fields found in a table: for each row with at least two cells, the
   * left-most cell is treated as the key and the right-most as the value.
   * Boxes already present in `fields` are skipped. Table-derived fields get
   * a fixed confidence of 0.7 / MEDIUM.
   */
  private extractFieldsFromTable(table: any, fields: K1ExtractedField[]) {
    if (!table.cells) return;

    const existingBoxes = new Set(fields.map((f) => f.boxNumber));

    // Group cells by row
    const rows = new Map<number, any[]>();
    for (const cell of table.cells) {
      const rowCells = rows.get(cell.rowIndex);
      if (rowCells) {
        rowCells.push(cell);
      } else {
        rows.set(cell.rowIndex, [cell]);
      }
    }

    for (const [, rowCells] of rows) {
      if (rowCells.length < 2) continue;

      // Sort by column index
      rowCells.sort((a: any, b: any) => a.columnIndex - b.columnIndex);

      const keyCell = rowCells[0]?.content?.trim();
      const valueCell = rowCells[rowCells.length - 1]?.content?.trim();

      if (!keyCell || !valueCell) continue;

      const boxNumber = this.matchKeyToBoxNumber(keyCell);
      if (boxNumber && !existingBoxes.has(boxNumber)) {
        fields.push({
          boxNumber,
          label: '',
          customLabel: null,
          rawValue: valueCell,
          numericValue: this.parseNumericValue(valueCell),
          confidence: 0.7, // Table extraction is less reliable
          confidenceLevel: 'MEDIUM',
          isUserEdited: false,
          isReviewed: false
        });
        existingBoxes.add(boxNumber);
      }
    }
  }

  /**
   * Pull header metadata out of the document's full text with regexes.
   *
   * NOTE(review): `isAmended` / `isFinal` are plain text searches. If the
   * printed form itself contains the words "Amended" / "Final K-1" as
   * checkbox labels, these flags would be true for every document — confirm.
   */
  private extractMetadata(text: string): K1ExtractionResult['metadata'] {
    return {
      partnershipName: this.extractPattern(
        text,
        /partnership['']s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i
      ),
      partnershipEin: this.extractPattern(
        text,
        /partnership['']s?\s+(?:employer\s+identification\s+number|EIN)[^:\n]*[:\s]+(\d{2}[- ]\d{7})/i
      ),
      partnerName: this.extractPattern(
        text,
        /partner['']s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i
      ),
      partnerEin: this.extractPattern(
        text,
        /partner['']s?\s+(?:identifying|social\s+security)\s+number[^:\n]*[:\s]+(\d{2}[- ]\d{7}|\d{3}[- ]\d{2}[- ]\d{4})/i
      ),
      taxYear: this.extractTaxYear(text),
      isAmended: /amended/i.test(text),
      isFinal: /final\s+k-?1/i.test(text) || /final\s+return/i.test(text)
    };
  }

  /** Return the trimmed first capture group of `pattern`, or null. */
  private extractPattern(text: string, pattern: RegExp): string | null {
    const match = text.match(pattern);
    return match ? match[1].trim() : null;
  }

  /**
   * Find a four-digit year following "calendar year" / "tax year".
   * Only years in 1900..2100 are accepted.
   */
  private extractTaxYear(text: string): number | null {
    const yearPatterns = [
      /(?:calendar\s+year|tax\s+year)\s*(\d{4})/i,
      /for\s+(?:calendar\s+year|tax\s+year)\s*(\d{4})/i
    ];

    for (const pattern of yearPatterns) {
      const match = text.match(pattern);
      if (match) {
        const year = parseInt(match[1], 10);
        if (year >= 1900 && year <= 2100) return year;
      }
    }

    return null;
  }

  /**
   * Parse a currency-like string into a number.
   *
   * Whitespace, '$' and ',' are removed before the sign is inspected, so
   * "$-100", "-$100", "($1,234)" and "$(1,234)" are all negative. A value is
   * negative when, after that cleanup, it starts with '(' or '-'.
   *
   * @returns the parsed number, or null for empty / non-numeric input
   */
  private parseNumericValue(raw: string): number | null {
    if (!raw) return null;

    const compact = raw.replace(/[\s$,]/g, '');
    const isNegative = compact.startsWith('(') || compact.startsWith('-');

    const magnitude = parseFloat(
      compact.replace(/[()]/g, '').replace(/^-/, '')
    );
    if (isNaN(magnitude)) return null;

    return isNegative ? -magnitude : magnitude;
  }
}

22
apps/api/src/app/k1-import/extractors/k1-extractor.interface.ts

@ -0,0 +1,22 @@
import type { K1ExtractionResult } from '@ghostfolio/common/interfaces';
/**
 * Interface for K-1 PDF data extractors.
 * Each extractor implements a different extraction strategy
 * (pdf-parse for digital PDFs, Azure DI for scanned, tesseract as fallback).
 */
export interface K1Extractor {
/**
 * Extract structured K-1 data from a PDF buffer.
 * @param buffer - The PDF file content as a Buffer
 * @param fileName - Original filename of the uploaded PDF
 * @returns A promise resolving to the extracted K-1 fields with confidence scores
 */
extract(buffer: Buffer, fileName: string): Promise<K1ExtractionResult>;
/**
 * Check if this extractor is available/configured.
 * For example, Azure extractor requires API keys to be configured.
 * Synchronous: implementations answer from configuration alone.
 */
isAvailable(): boolean;
}

1325
apps/api/src/app/k1-import/extractors/k1-position-regions.ts

File diff suppressed because it is too large

967
apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts

@ -0,0 +1,967 @@
import type {
K1ExtractionResult,
K1ExtractedField,
K1UnmappedItem
} from '@ghostfolio/common/interfaces';
import { Injectable, Logger } from '@nestjs/common';
import { resolve } from 'path';
import type { K1Extractor } from './k1-extractor.interface';
import {
K1_POSITION_REGIONS,
POSITION_TOLERANCE,
SUBTYPE_Y_TOLERANCE,
type K1PositionRegion
} from './k1-position-regions';
// ============================================================================
// Internal types for pdfjs-dist text extraction
// ============================================================================
/**
 * Shape of one entry of `textContent.items` as consumed by this extractor.
 * Only `str`, `transform[4]`, `transform[5]` and `fontName` are read here.
 */
interface PdfTextItem {
str: string;
transform: number[]; // [scaleX, skewY, skewX, scaleY, x, y]
width: number;
height: number;
fontName: string; // key into the `styles` map of the same text content
hasEOL: boolean;
dir: string;
}
/**
 * Shape of one entry of `textContent.styles`, looked up by an item's
 * `fontName`. Only `fontFamily` is read by this extractor.
 */
interface PdfTextStyle {
fontFamily: string;
ascent: number;
descent: number;
vertical: boolean;
}
/**
 * A text item that survived font filtering, reduced to what the position
 * matching passes need.
 */
interface DataItem {
text: string; // trimmed item text
x: number; // transform[4] of the source item
y: number; // transform[5] of the source item
fontName: string;
fontFamily: string; // lower-cased family from the style map
matched: boolean; // starts false; set once an extraction pass has consumed the item
}
/**
* Tier 1 extractor for digitally-generated K-1 PDFs.
* Uses pdfjs-dist position-based text extraction with font discrimination
* to accurately map values to K-1 form fields by (x, y) coordinates.
*/
@Injectable()
export class PdfParseExtractor implements K1Extractor {
private readonly logger = new Logger(PdfParseExtractor.name);
/**
 * Always reports this extractor as available; unlike the Azure extractor
 * it checks no configuration.
 */
public isAvailable(): boolean {
return true; // Always available — no external dependencies
}
// ==========================================================================
// T003: Main extraction entry point — pdfjs-dist scaffold
// ==========================================================================
/**
 * Extract K-1 fields from a digitally generated PDF.
 *
 * Loads the document with pdfjs-dist, reads the text items of page 1 only,
 * keeps the items that pass font filtering and then runs the extraction
 * passes in a fixed order. Each pass marks the items it consumes via
 * `matched`, so the order of the passes matters.
 *
 * Never throws: any failure is logged and an empty result is returned.
 *
 * @param buffer - PDF content
 * @param fileName - original file name (used for log messages only)
 * @returns the extraction result; `pagesProcessed` carries the document's
 *   total page count even though only page 1 is read, and is 1 on failure
 */
public async extract(
buffer: Buffer,
fileName: string
): Promise<K1ExtractionResult> {
this.logger.log(`Extracting from digital PDF: ${fileName}`);
// Dynamic import — API project compiles to CommonJS via webpack
const { getDocument, GlobalWorkerOptions } = await import(
'pdfjs-dist/legacy/build/pdf.mjs'
);
// Configure worker. The path is resolved against the current working
// directory; backslashes are normalised so Windows paths form a valid URL.
const workerPath =
'file:///' +
resolve(
'node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs'
).replace(/\\/g, '/');
GlobalWorkerOptions.workerSrc = workerPath;
// Declared outside the try block so the finally clause can release it
let pdfDoc: any = null;
try {
const loadingTask = getDocument({
data: new Uint8Array(buffer),
standardFontDataUrl:
resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
cMapPacked: true,
isEvalSupported: false,
disableFontFace: true
});
pdfDoc = await loadingTask.promise;
const pageCount = pdfDoc.numPages;
// T024: Process only page 1 (FR-024)
const page = await pdfDoc.getPage(1);
const textContent = await page.getTextContent({
includeMarkedContent: false
});
const items = textContent.items as PdfTextItem[];
const styles = textContent.styles as Record<string, PdfTextStyle>;
if (items.length === 0) {
this.logger.warn(`No text items found in ${fileName}`);
return this.emptyResult(pageCount);
}
// T004: Font discrimination
const dataItems = this.filterDataItems(items, styles);
if (dataItems.length === 0) {
this.logger.warn(
`No data-font items found in ${fileName} (${items.length} total items)`
);
return this.emptyResult(pageCount);
}
this.logger.log(
`Found ${dataItems.length} data items out of ${items.length} total (${fileName})`
);
// Extract all field categories
const fields: K1ExtractedField[] = [];
const metadata = this.initMetadata();
// Checkboxes first — consume "X" marks before Part III so the
// BOX_16_K3 checkbox doesn't get grabbed as a BOX_16 value.
this.extractCheckboxes(dataItems, fields, metadata);
// T007-T010 (US1): Part III extraction
this.extractPartIII(dataItems, fields);
// T011-T014 (US2): Header + Part I/II metadata
this.extractMetadata(dataItems, fields, metadata);
// T015-T018 (US3): Sections J/K/L/M/N
this.extractSections(dataItems, fields);
// T021 (US5): Unmapped items — runs last so it sees the final matched flags
const unmappedItems = this.collectUnmappedItems(dataItems);
// T024: Confidence scoring
const overallConfidence = this.computeOverallConfidence(fields);
return {
metadata,
fields,
unmappedItems,
overallConfidence,
method: 'pdf-parse',
pagesProcessed: pageCount
};
} catch (error: unknown) {
// T023: Graceful error handling — log and fall back to an empty result
const message =
error instanceof Error ? error.message : String(error);
this.logger.error(
`Extraction failed for ${fileName}: ${message}`
);
return this.emptyResult(1);
} finally {
// T025: Cleanup — always destroy
if (pdfDoc) {
try {
await pdfDoc.destroy();
} catch {
// Ignore cleanup errors
}
}
}
}
// ==========================================================================
// T004: Font discrimination — separate filled-in data from template text
// ==========================================================================
/**
 * Keep only the text items that look like filled-in data.
 *
 * An item is dropped when its text is blank, when its font has no entry in
 * `styles`, or when the font family (lower-cased) is exactly 'serif', which
 * this extractor treats as the form template's own text.
 *
 * @returns the surviving items with their position taken from
 *   transform[4] / transform[5] and `matched` initialised to false
 */
private filterDataItems(
  items: PdfTextItem[],
  styles: Record<string, PdfTextStyle>
): DataItem[] {
  return items.flatMap((item): DataItem[] => {
    const text = item.str.trim();
    if (!text) return [];

    const style = styles[item.fontName];
    if (!style) return [];

    const fontFamily = style.fontFamily.toLowerCase();
    // Template text uses serif fonts; data values use sans-serif or monospace
    if (fontFamily === 'serif') return [];

    const [, , , , x, y] = item.transform;
    return [
      { text, x, y, fontName: item.fontName, fontFamily, matched: false }
    ];
  });
}
// ==========================================================================
// T006: Value parsing utility
// ==========================================================================
/**
 * Parse a K-1 value string to a number.
 * Rules:
 * 1. Remove commas
 * 2. Parenthesized = negative, including when a dollar sign precedes the
 *    parenthesis, e.g. "$(1,234)"
 * 3. Leading minus = negative
 * 4. Strip dollar sign
 * 5. Preserve decimal percentages (a trailing "%" is ignored)
 * 6. "SEE STMT" → null
 * 7. "X" (checkbox) → null
 * 8. Empty → null
 * 9. Text like "E-FILE" → null
 *
 * @param raw - the text of a data item
 * @returns the parsed number, or null when the text is not numeric
 */
public parseNumericValue(raw: string): number | null {
  if (!raw) return null;

  const trimmed = raw.trim();
  if (!trimmed) return null;

  // Non-numeric text values
  const upper = trimmed.toUpperCase();
  if (
    upper === 'SEE STMT' ||
    upper === 'STMT' ||
    upper === 'SEE STATEMENT' ||
    upper === 'X' ||
    upper === 'E-FILE' ||
    upper === 'YES' ||
    upper === 'NO'
  ) {
    return null;
  }

  // Strip $ and commas BEFORE looking for parentheses, so that a currency
  // symbol outside the parentheses ("$(1,234)") does not hide the sign.
  let cleaned = trimmed.replace(/[$,]/g, '').trim();

  // Detect negative: parenthesized
  const isParenNegative = /^\(.*\)$/.test(cleaned);
  cleaned = cleaned.replace(/[()]/g, '');

  // Detect leading minus
  const isMinusNegative = cleaned.startsWith('-');
  if (isMinusNegative) {
    cleaned = cleaned.substring(1);
  }

  // Try parsing
  const num = parseFloat(cleaned);
  if (isNaN(num)) return null;

  return isParenNegative || isMinusNegative ? -num : num;
}
// ==========================================================================
// T024: Confidence scoring from the item's offset to the region centre
// ==========================================================================
/**
 * Score how well a position fits a region.
 *
 * The offset is the larger of the horizontal and vertical distances to the
 * region centre; the "ideal" radius is the larger of the region's half
 * width and half height. Within the radius → 0.95 / HIGH, up to 5 units
 * beyond → 0.85 / MEDIUM, otherwise 0.65 / LOW.
 *
 * NOTE(review): because a single radius is used for both axes, a wide but
 * flat region tolerates the same vertical offset as horizontal — confirm
 * this is intended.
 */
private computeConfidence(
  x: number,
  y: number,
  region: K1PositionRegion
): { confidence: number; confidenceLevel: 'HIGH' | 'MEDIUM' | 'LOW' } {
  const halfWidth = (region.xMax - region.xMin) / 2;
  const halfHeight = (region.yMax - region.yMin) / 2;
  const idealRadius = Math.max(halfWidth, halfHeight);

  const offsetX = Math.abs(x - (region.xMin + region.xMax) / 2);
  const offsetY = Math.abs(y - (region.yMin + region.yMax) / 2);
  const distance = Math.max(offsetX, offsetY);

  if (distance <= idealRadius) {
    return { confidence: 0.95, confidenceLevel: 'HIGH' };
  }
  if (distance <= idealRadius + 5) {
    return { confidence: 0.85, confidenceLevel: 'MEDIUM' };
  }
  return { confidence: 0.65, confidenceLevel: 'LOW' };
}
/**
 * Mean confidence of all fields, rounded to two decimals.
 * Returns 0 when there are no fields.
 */
private computeOverallConfidence(fields: K1ExtractedField[]): number {
  if (fields.length === 0) return 0;

  let total = 0;
  for (const field of fields) {
    total += field.confidence;
  }

  const mean = total / fields.length;
  return Math.round(mean * 100) / 100;
}
// ==========================================================================
// T023: Result returned when nothing could be extracted
// ==========================================================================
/**
 * Build a result with blank metadata, no fields, no unmapped items and an
 * overall confidence of 0.
 *
 * @param pageCount - value reported as `pagesProcessed`
 */
private emptyResult(pageCount: number): K1ExtractionResult {
  const blank: K1ExtractionResult = {
    metadata: this.initMetadata(),
    fields: [],
    unmappedItems: [],
    overallConfidence: 0,
    method: 'pdf-parse',
    pagesProcessed: pageCount
  };

  return blank;
}
/**
 * Create a fresh metadata object: every name, identifier and the tax year
 * start as null, and both the amended and final flags start as false.
 */
private initMetadata(): K1ExtractionResult['metadata'] {
  const blankMetadata: K1ExtractionResult['metadata'] = {
    partnershipName: null,
    partnershipEin: null,
    partnerName: null,
    partnerEin: null,
    taxYear: null,
    isAmended: false,
    isFinal: false
  };

  return blankMetadata;
}
// ==========================================================================
// T007-T010 (US1): Part III extraction — boxes 1-21 with subtypes
// ==========================================================================
/**
 * Extract all non-checkbox Part III regions.
 *
 * Regions that carry a subtype code are handled before the plain ones:
 * items consumed by a subtype region are marked as matched and can then no
 * longer be claimed by a simple region.
 */
private extractPartIII(
  dataItems: DataItem[],
  fields: K1ExtractedField[]
): void {
  const withSubtype: K1PositionRegion[] = [];
  const withoutSubtype: K1PositionRegion[] = [];

  for (const region of K1_POSITION_REGIONS) {
    if (region.fieldCategory !== 'PART_III') continue;
    if (region.valueType === 'checkbox') continue;

    if (region.hasSubtype) {
      withSubtype.push(region);
    } else {
      withoutSubtype.push(region);
    }
  }

  // CRITICAL: subtype regions go FIRST (right column boxes 14-21 and left
  // column boxes 11-13). This prevents left-column simple regions from
  // stealing right-column subtype codes at x~455.
  withSubtype.forEach((region) => {
    this.extractSubtypeField(dataItems, fields, region);
  });

  withoutSubtype.forEach((region) => {
    this.extractSimpleField(dataItems, fields, region);
  });
}
/**
 * T008: Pair subtype code + value for a region.
 * T009: Handle multi-subtype boxes (e.g., box 20 with A, B, Z, *).
 *
 * Unmatched items inside the region's y-range (± POSITION_TOLERANCE) are
 * split into subtype codes (subtype column) and values (value column).
 * - With codes: one field is emitted per code, paired with the still
 *   unmatched value whose y is closest to the code's y and within
 *   SUBTYPE_Y_TOLERANCE (ties keep the earlier item). A code without a
 *   partner yields a field with an empty raw value.
 * - Without codes: the first value, if any, is emitted as a plain field.
 *
 * Every item used is marked as matched.
 */
private extractSubtypeField(
  dataItems: DataItem[],
  fields: K1ExtractedField[],
  region: K1PositionRegion
): void {
  // Find all code items in the subtype column within this region's y-range
  const codes: DataItem[] = [];
  const values: DataItem[] = [];

  for (const item of dataItems) {
    if (item.matched) continue;

    const inYRange =
      item.y >= region.yMin - POSITION_TOLERANCE &&
      item.y <= region.yMax + POSITION_TOLERANCE;
    if (!inYRange) continue;

    // Is it in the subtype code column?
    if (
      region.subtypeXMin !== null &&
      region.subtypeXMax !== null &&
      item.x >= region.subtypeXMin - POSITION_TOLERANCE &&
      item.x <= region.subtypeXMax + POSITION_TOLERANCE
    ) {
      codes.push(item);
    }
    // Is it in the value column?
    else if (
      item.x >= region.xMin - POSITION_TOLERANCE &&
      item.x <= region.xMax + POSITION_TOLERANCE
    ) {
      values.push(item);
    }
  }

  // If we have codes, pair each code with the closest value at same y
  if (codes.length > 0) {
    for (const code of codes) {
      // Pick the nearest unmatched value in the code's y-band. Taking merely
      // the first candidate could pair a code with a neighbouring row's
      // value when rows are spaced closer than the tolerance.
      let pairedValue: DataItem | undefined;
      let smallestGap = Infinity;
      for (const candidate of values) {
        if (candidate.matched) continue;
        const gap = Math.abs(candidate.y - code.y);
        if (gap <= SUBTYPE_Y_TOLERANCE && gap < smallestGap) {
          pairedValue = candidate;
          smallestGap = gap;
        }
      }

      const rawValue = pairedValue ? pairedValue.text : '';
      const numericValue = this.parseNumericValue(rawValue);
      const { confidence, confidenceLevel } = this.computeConfidence(
        code.x,
        code.y,
        region
      );

      fields.push({
        boxNumber: region.boxNumber,
        label: region.label,
        customLabel: null,
        rawValue,
        numericValue,
        confidence,
        confidenceLevel,
        isUserEdited: false,
        isReviewed: false,
        subtype: code.text.trim(),
        fieldCategory: region.fieldCategory,
        isCheckbox: false
      });

      code.matched = true;
      if (pairedValue) pairedValue.matched = true;
    }
  }
  // If no codes but we have values, treat as simple single-value field
  else if (values.length > 0) {
    const item = values[0];
    const numericValue = this.parseNumericValue(item.text);
    const { confidence, confidenceLevel } = this.computeConfidence(
      item.x,
      item.y,
      region
    );

    fields.push({
      boxNumber: region.boxNumber,
      label: region.label,
      customLabel: null,
      rawValue: item.text,
      numericValue,
      confidence,
      confidenceLevel,
      isUserEdited: false,
      isReviewed: false,
      subtype: null,
      fieldCategory: region.fieldCategory,
      isCheckbox: false
    });

    item.matched = true;
  }
}
/**
 * Simple non-subtype field extraction — take the item chosen by
 * findBestItemInRegion for this region, emit it as a field and mark it as
 * matched. Does nothing when no item is found.
 *
 * Checkbox and text regions never get a numeric value; all other regions
 * have their text parsed with parseNumericValue.
 */
private extractSimpleField(
  dataItems: DataItem[],
  fields: K1ExtractedField[],
  region: K1PositionRegion
): void {
  const item = this.findBestItemInRegion(dataItems, region);
  if (!item) return;

  const isCheckbox = region.valueType === 'checkbox';
  const isNonNumeric = isCheckbox || region.valueType === 'text';

  let numericValue: number | null = null;
  if (!isNonNumeric) {
    numericValue = this.parseNumericValue(item.text);
  }

  const score = this.computeConfidence(item.x, item.y, region);

  fields.push({
    boxNumber: region.boxNumber,
    label: region.label,
    customLabel: null,
    rawValue: item.text,
    numericValue,
    confidence: score.confidence,
    confidenceLevel: score.confidenceLevel,
    isUserEdited: false,
    isReviewed: false,
    subtype: null,
    fieldCategory: region.fieldCategory,
    isCheckbox
  });

  item.matched = true;
}
// ==========================================================================
// T011-T014 (US2): Metadata extraction — header, Part I, Part II
// ==========================================================================
/**
 * Fill `metadata` (and emit matching fields) from the header and Parts I/II.
 *
 * Tax year: unmatched items with y > 710 and 200 < x < 350 whose text is
 * 2-4 digits are candidates. Two or more are concatenated left to right
 * (e.g. "20" + "25"); a single one must be four digits long. The year is
 * accepted only in 1900..2100, in which case the candidates are consumed
 * and a TAX_YEAR field is emitted.
 *
 * Afterwards the partnership / partner text regions are read, and finally
 * every METADATA text region is run through extractSimpleField.
 */
private extractMetadata(
  dataItems: DataItem[],
  fields: K1ExtractedField[],
  metadata: K1ExtractionResult['metadata']
): void {
  // Header region: year fragments near the top of the page
  const taxYearItems = dataItems.filter(
    (item) =>
      !item.matched &&
      item.y > 710 &&
      item.x > 200 &&
      item.x < 350 &&
      /^\d{2,4}$/.test(item.text)
  );

  let resolvedYear: number | null = null;

  if (taxYearItems.length >= 2) {
    // Combine year fragments (e.g., "20" + "25" → 2025), left to right
    taxYearItems.sort((a, b) => a.x - b.x);
    const joined = taxYearItems.map((i) => i.text).join('');
    const candidate = parseInt(joined, 10);
    if (candidate >= 1900 && candidate <= 2100) {
      resolvedYear = candidate;
    }
  } else if (taxYearItems.length === 1) {
    const onlyText = taxYearItems[0].text;
    const candidate = parseInt(onlyText, 10);
    if (onlyText.length === 4 && candidate >= 1900 && candidate <= 2100) {
      resolvedYear = candidate;
    }
  }

  if (resolvedYear !== null) {
    metadata.taxYear = resolvedYear;

    for (const item of taxYearItems) {
      item.matched = true;
    }

    // Also emit as a field so it appears in cell mapping
    fields.push({
      boxNumber: 'TAX_YEAR',
      label: 'Tax Year',
      customLabel: null,
      rawValue: String(resolvedYear),
      numericValue: null,
      confidence: 1.0,
      confidenceLevel: 'HIGH',
      isUserEdited: false,
      isReviewed: false,
      subtype: null,
      fieldCategory: 'METADATA',
      isCheckbox: false
    });
  }

  // Part I: Partnership info
  this.extractTextMetadata(dataItems, 'A_EIN', metadata, 'partnershipEin');
  this.extractTextMetadata(dataItems, 'B_NAME', metadata, 'partnershipName');
  this.extractTextMetadata(dataItems, 'C_IRS_CENTER', metadata, null);

  // Part II: Partner info
  this.extractTextMetadata(dataItems, 'E_TIN', metadata, 'partnerEin');
  this.extractTextMetadata(dataItems, 'F_NAME_ADDR', metadata, 'partnerName');

  // Remaining metadata text regions go into the fields array
  K1_POSITION_REGIONS.filter(
    (r) => r.fieldCategory === 'METADATA' && r.valueType === 'text'
  ).forEach((region) => {
    this.extractSimpleField(dataItems, fields, region);
  });
}
/**
 * Collect every unmatched item inside one region (± POSITION_TOLERANCE),
 * join their texts top-to-bottom with spaces and store the result under
 * `metadataKey`. Multi-line values such as names/addresses are thereby
 * merged into one string.
 *
 * - 'taxYear' is parsed as an integer and stored only if in 1900..2100.
 * - 'isFinal' / 'isAmended' are set to true when any text is present.
 * - A null key stores nothing.
 *
 * All collected items are marked as matched, whether or not a value was
 * stored. Unknown region ids and empty regions are ignored.
 */
private extractTextMetadata(
  dataItems: DataItem[],
  regionFieldId: string,
  metadata: K1ExtractionResult['metadata'],
  metadataKey: keyof K1ExtractionResult['metadata'] | null
): void {
  const region = K1_POSITION_REGIONS.find(
    (r) => r.fieldId === regionFieldId
  );
  if (!region) return;

  const left = region.xMin - POSITION_TOLERANCE;
  const right = region.xMax + POSITION_TOLERANCE;
  const bottom = region.yMin - POSITION_TOLERANCE;
  const top = region.yMax + POSITION_TOLERANCE;

  const matches = dataItems.filter(
    (item) =>
      !item.matched &&
      item.x >= left &&
      item.x <= right &&
      item.y >= bottom &&
      item.y <= top
  );
  if (matches.length === 0) return;

  // Sort by y descending (top of page first in PDF coords)
  matches.sort((a, b) => b.y - a.y);
  const combinedText = matches
    .map((m) => m.text)
    .join(' ')
    .trim();

  if (metadataKey && combinedText) {
    switch (metadataKey) {
      case 'taxYear': {
        const year = parseInt(combinedText, 10);
        if (year >= 1900 && year <= 2100) {
          metadata.taxYear = year;
        }
        break;
      }
      case 'isFinal':
      case 'isAmended':
        (metadata as any)[metadataKey] = true;
        break;
      default:
        (metadata as any)[metadataKey] = combinedText;
    }
  }

  matches.forEach((item) => {
    item.matched = true;
  });
}
// ==========================================================================
// T015-T018 (US3): Section J/K/L/M/N extraction
// Uses closest-center assignment so closely-spaced rows (Section L has
// 12pt row spacing, smaller than POSITION_TOLERANCE=15) get correct mapping.
// ==========================================================================
/**
 * Extract the non-checkbox fields of Sections J, K, L, M and N.
 *
 * Each section is processed on its own: its regions are handed to
 * `assignItemsToRegions`, and every resulting (region, item) pair is
 * appended to `fields` with a position-based confidence. The assigned
 * item is then flagged as matched.
 *
 * @param dataItems Positioned text items; assigned ones are mutated.
 * @param fields    Output array that receives the extracted fields.
 */
private extractSections(
  dataItems: DataItem[],
  fields: K1ExtractedField[]
): void {
  for (const category of [
    'SECTION_J',
    'SECTION_K',
    'SECTION_L',
    'SECTION_M',
    'SECTION_N'
  ]) {
    const sectionRegions = K1_POSITION_REGIONS.filter(
      (candidate) =>
        candidate.fieldCategory === category &&
        candidate.valueType !== 'checkbox'
    );

    this.assignItemsToRegions(dataItems, sectionRegions).forEach(
      (item, region) => {
        const numericValue = this.parseNumericValue(item.text);
        const score = this.computeConfidence(item.x, item.y, region);

        fields.push({
          boxNumber: region.boxNumber,
          label: region.label,
          customLabel: null,
          rawValue: item.text,
          numericValue,
          confidence: score.confidence,
          confidenceLevel: score.confidenceLevel,
          isUserEdited: false,
          isReviewed: false,
          subtype: null,
          fieldCategory: region.fieldCategory,
          isCheckbox: false
        });

        item.matched = true;
      }
    );
  }
}
// ==========================================================================
// T019-T020 (US4): Checkbox extraction
// Uses closest-center assignment to prevent adjacent checkbox regions
// (e.g., G_GENERAL/G_LIMITED, M_YES/M_NO) from stealing each other's marks.
// ==========================================================================
/**
 * Extract every checkbox region as a boolean field.
 *
 * Only items whose text is a check-mark glyph ("X", "✓" or "✗", case-
 * insensitive, surrounding whitespace ignored) compete for checkbox
 * regions. Filtering happens BEFORE the closest-centre assignment: if
 * arbitrary text were allowed to compete, a stray item sitting nearer to a
 * box centre would claim the region, be discarded as "not a mark", and the
 * real mark would be lost or pushed onto a neighbouring box.
 *
 * Checked regions are emitted with rawValue 'true' and a position-based
 * confidence; all remaining checkbox regions are emitted with rawValue
 * 'false' and confidence 1.0. The FINAL_K1 / AMENDED_K1 regions also set
 * `metadata.isFinal` / `metadata.isAmended`.
 *
 * @param dataItems Positioned text items; assigned marks are flagged matched.
 * @param fields    Output array that receives one entry per checkbox region.
 * @param metadata  Extraction metadata updated for the known flag checkboxes.
 */
private extractCheckboxes(
  dataItems: DataItem[],
  fields: K1ExtractedField[],
  metadata: K1ExtractionResult['metadata']
): void {
  const checkboxRegions = K1_POSITION_REGIONS.filter(
    (r) => r.valueType === 'checkbox'
  );

  const isCheckMark = (text: string): boolean => {
    const glyph = text.trim().toUpperCase();
    return glyph === 'X' || glyph === '✓' || glyph === '✗';
  };

  // Restrict the candidate pool to real marks so that nearby non-mark text
  // can never occupy a checkbox region (already-matched items are skipped
  // inside assignItemsToRegions).
  const markItems = dataItems.filter((item) => isCheckMark(item.text));
  const assignments = this.assignItemsToRegions(markItems, checkboxRegions);

  // Track which checkbox regions were matched (checked)
  const checkedRegionIds = new Set<string>();

  for (const [region, item] of assignments) {
    checkedRegionIds.add(region.fieldId);

    const { confidence, confidenceLevel } = this.computeConfidence(
      item.x,
      item.y,
      region
    );

    fields.push({
      boxNumber: region.boxNumber,
      label: region.label,
      customLabel: null,
      rawValue: 'true',
      numericValue: null,
      confidence,
      confidenceLevel,
      isUserEdited: false,
      isReviewed: false,
      subtype: null,
      fieldCategory: 'CHECKBOX',
      isCheckbox: true
    });

    item.matched = true;

    // Set metadata flags for known checkboxes
    if (region.fieldId === 'FINAL_K1') {
      metadata.isFinal = true;
    } else if (region.fieldId === 'AMENDED_K1') {
      metadata.isAmended = true;
    }
  }

  // Emit false for all unchecked checkbox regions
  for (const region of checkboxRegions) {
    if (checkedRegionIds.has(region.fieldId)) continue;

    fields.push({
      boxNumber: region.boxNumber,
      label: region.label,
      customLabel: null,
      rawValue: 'false',
      numericValue: null,
      confidence: 1.0,
      confidenceLevel: 'HIGH',
      isUserEdited: false,
      isReviewed: false,
      subtype: null,
      fieldCategory: 'CHECKBOX',
      isCheckbox: true
    });
  }
}
// ==========================================================================
// T021 (US5): Unmapped items collection
// ==========================================================================
/**
 * Turn every data item that no extraction pass consumed into an unmapped
 * item for manual resolution.
 *
 * Empty or single-character texts are dropped as noise unless the character
 * is a digit or an upper-case "X"; those two cases are kept. Coordinates
 * are rounded to one decimal place, confidence is fixed at 0.5 and the page
 * number is always reported as 1.
 */
private collectUnmappedItems(dataItems: DataItem[]): K1UnmappedItem[] {
  const roundToTenth = (value: number): number =>
    Math.round(value * 10) / 10;

  const leftovers: K1UnmappedItem[] = [];

  dataItems.forEach((item) => {
    if (item.matched) {
      return;
    }

    const { text } = item;
    // Likely stray punctuation: at most one character, not a digit, not "X".
    const isNoise = text.length <= 1 && !/\d/.test(text) && text !== 'X';
    if (isNoise) {
      return;
    }

    leftovers.push({
      rawLabel: '',
      rawValue: text,
      numericValue: this.parseNumericValue(text),
      confidence: 0.5,
      pageNumber: 1,
      resolution: null,
      assignedBoxNumber: null,
      x: roundToTenth(item.x),
      y: roundToTenth(item.y),
      fontName: item.fontName
    });
  });

  return leftovers;
}
// ==========================================================================
// T005: Position matching helpers
// ==========================================================================
/**
 * Pick the unmatched item that lies nearest to the centre of one region.
 *
 * Only items inside the region's bounds, widened by POSITION_TOLERANCE on
 * every side, are considered. Distance is Euclidean; on an exact tie the
 * item that appears first in `dataItems` wins.
 *
 * @returns The closest item, or `null` when no unmatched item is in range.
 */
private findBestItemInRegion(
  dataItems: DataItem[],
  region: K1PositionRegion
): DataItem | null {
  const centerX = (region.xMin + region.xMax) / 2;
  const centerY = (region.yMin + region.yMax) / 2;
  const minX = region.xMin - POSITION_TOLERANCE;
  const maxX = region.xMax + POSITION_TOLERANCE;
  const minY = region.yMin - POSITION_TOLERANCE;
  const maxY = region.yMax + POSITION_TOLERANCE;

  let winner: DataItem | null = null;
  let winnerDistance = Infinity;

  for (const candidate of dataItems) {
    const inRange =
      candidate.x >= minX &&
      candidate.x <= maxX &&
      candidate.y >= minY &&
      candidate.y <= maxY;
    if (candidate.matched || !inRange) {
      continue;
    }

    const offsetX = candidate.x - centerX;
    const offsetY = candidate.y - centerY;
    const candidateDistance = Math.sqrt(
      offsetX * offsetX + offsetY * offsetY
    );

    if (candidateDistance < winnerDistance) {
      winner = candidate;
      winnerDistance = candidateDistance;
    }
  }

  return winner;
}
/**
 * Pair items with regions, nearest centre first.
 *
 * Every unmatched item is paired with every region whose tolerance-widened
 * bounds contain it, recording the Euclidean distance to the region centre.
 * The pairs are then sorted by ascending distance and accepted greedily:
 * a pair is taken only if neither its item nor its region has been taken
 * already. As a result each region receives at most one item and each item
 * serves at most one region, which keeps adjacent or overlapping regions
 * (e.g. G_GENERAL/G_LIMITED, Section L rows 12pt apart) from taking each
 * other's data through their overlapping tolerance windows.
 *
 * The method does not flag items as matched; callers do that.
 */
private assignItemsToRegions(
  dataItems: DataItem[],
  regions: K1PositionRegion[]
): Map<K1PositionRegion, DataItem> {
  type Pairing = {
    item: DataItem;
    region: K1PositionRegion;
    distance: number;
  };

  const pairings: Pairing[] = [];

  dataItems.forEach((item) => {
    if (item.matched) {
      return;
    }

    regions.forEach((region) => {
      const withinX =
        item.x >= region.xMin - POSITION_TOLERANCE &&
        item.x <= region.xMax + POSITION_TOLERANCE;
      const withinY =
        item.y >= region.yMin - POSITION_TOLERANCE &&
        item.y <= region.yMax + POSITION_TOLERANCE;
      if (!(withinX && withinY)) {
        return;
      }

      const offsetX = item.x - (region.xMin + region.xMax) / 2;
      const offsetY = item.y - (region.yMin + region.yMax) / 2;

      pairings.push({
        item,
        region,
        distance: Math.sqrt(offsetX * offsetX + offsetY * offsetY)
      });
    });
  });

  // Nearest pairs get first pick.
  pairings.sort((left, right) => left.distance - right.distance);

  const takenItems = new Set<DataItem>();
  const assignment = new Map<K1PositionRegion, DataItem>();

  for (const { item, region } of pairings) {
    const available = !takenItems.has(item) && !assignment.has(region);
    if (available) {
      assignment.set(region, item);
      takenItems.add(item);
    }
  }

  return assignment;
}
// ==========================================================================
// Preserved: isDigitalK1 — used by isAvailable() and external callers
// ==========================================================================
/**
 * Detect if the PDF is a digital (text-embedded) K-1 document.
 *
 * Loads page 1 with pdf.js, joins its text items with spaces and reports
 * `true` when that text is at least 100 characters long and contains one
 * of the K-1 keywords ("Schedule K-1", "Form 1065", "Partner's Share").
 *
 * Detection is best-effort: any failure while loading or reading the PDF
 * is treated as "not a digital K-1" and yields `false` rather than an
 * exception.
 *
 * @param buffer Raw PDF bytes.
 * @returns Whether page 1 carries enough embedded text with a K-1 keyword.
 */
public async isDigitalK1(buffer: Buffer): Promise<boolean> {
  try {
    const { getDocument, GlobalWorkerOptions } = await import(
      'pdfjs-dist/legacy/build/pdf.mjs'
    );
    const { pathToFileURL } = await import('node:url');

    // Build the worker URL with pathToFileURL instead of concatenating
    // 'file:///' with the path: concatenation produces 'file:////…' for
    // POSIX absolute paths and leaves characters such as '#', '?' and '%'
    // unescaped, which changes the meaning of the URL.
    GlobalWorkerOptions.workerSrc = pathToFileURL(
      resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')
    ).href;

    const loadingTask = getDocument({
      data: new Uint8Array(buffer),
      isEvalSupported: false,
      disableFontFace: true
    });
    const pdfDoc = await loadingTask.promise;

    try {
      const page = await pdfDoc.getPage(1);
      const textContent = await page.getTextContent({
        includeMarkedContent: false
      });
      const text = (textContent.items as PdfTextItem[])
        .map((item) => item.str)
        .join(' ');

      // Too little text: treat the page as not text-embedded.
      if (text.length < 100) return false;

      const k1Keywords = [
        'Schedule K-1',
        'Form 1065',
        "Partner's Share"
      ];
      return k1Keywords.some((kw) => text.includes(kw));
    } finally {
      // Always release the document, whichever way the block above exits.
      await pdfDoc.destroy();
    }
  } catch {
    return false;
  }
}
}

208
apps/api/src/app/k1-import/extractors/tesseract-extractor.ts

@ -0,0 +1,208 @@
import type { K1ExtractionResult, K1ExtractedField } from '@ghostfolio/common/interfaces';
import { Injectable, Logger } from '@nestjs/common';
import { PdfParseExtractor } from './pdf-parse-extractor';
import type { K1Extractor } from './k1-extractor.interface';
/**
 * Tier 2 fallback extractor using tesseract.js (WASM-based OCR).
 * Self-hosted, zero-config — no external API keys required.
 * Lower accuracy (~75%) compared to Azure DI (~95%).
 */
@Injectable()
export class TesseractExtractor implements K1Extractor {
  /**
   * Label-anchored patterns, one entry per K-1 box. Each pattern matches a
   * box label, skips everything up to the next "$", digit or "-", and
   * captures the following run of value characters.
   *
   * Kept as a static member so the regular expressions are built once
   * instead of on every extraction call. None of them uses the `g` flag,
   * so sharing them between calls carries no `lastIndex` state.
   */
  private static readonly BOX_PATTERNS: ReadonlyArray<{
    boxNumber: string;
    patterns: RegExp[];
  }> = [
    { boxNumber: '1', patterns: [/ordinary\s+business\s+income[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '2', patterns: [/net\s+rental\s+real\s+estate[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '3', patterns: [/other\s+net\s+rental[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '4', patterns: [/guaranteed\s+payments?\s+for\s+services[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '4a', patterns: [/guaranteed\s+payments?\s+for\s+capital[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '4b', patterns: [/total\s+guaranteed\s+payments?[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '5', patterns: [/interest\s+income[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '6a', patterns: [/ordinary\s+dividends[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '6b', patterns: [/qualified\s+dividends[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '6c', patterns: [/dividend\s+equivalents[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '7', patterns: [/royalties[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '8', patterns: [/net\s+short[- ]term\s+capital[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '9a', patterns: [/net\s+long[- ]term\s+capital[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '9b', patterns: [/collectibles.*28%[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '9c', patterns: [/unrecaptured\s+section\s*1250[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '10', patterns: [/net\s+section\s*1231[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '11', patterns: [/other\s+income[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '12', patterns: [/section\s*179\s+deduction[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '13', patterns: [/other\s+deductions[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '14', patterns: [/self[- ]employment\s+earnings[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '15', patterns: [/credits[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '16', patterns: [/foreign\s+transactions[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '17', patterns: [/alternative\s+minimum\s+tax[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '18', patterns: [/tax[- ]exempt\s+income[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '19a', patterns: [/distributions.*cash\s+and\s+marketable[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '19b', patterns: [/distributions.*other\s+property[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '20', patterns: [/other\s+information[^$\d-]*([($\d,.\-)]+)/i] },
    { boxNumber: '21', patterns: [/foreign\s+taxes\s+paid[^$\d-]*([($\d,.\-)]+)/i] }
  ];

  private readonly logger = new Logger(TesseractExtractor.name);

  /** Lazily created tesseract.js worker, reused across extract() calls. */
  private worker: any = null;

  public constructor(
    private readonly pdfParseExtractor: PdfParseExtractor
  ) {}

  public isAvailable(): boolean {
    return true; // Always available — WASM-based, no dependencies
  }

  /**
   * NestJS shutdown hook: terminate the OCR worker, if one was created, so
   * it does not outlive the module. Termination failures are logged and
   * otherwise ignored because shutdown must proceed regardless.
   */
  public async onModuleDestroy(): Promise<void> {
    const activeWorker = this.worker;
    this.worker = null;

    if (!activeWorker) {
      return;
    }

    try {
      await activeWorker.terminate();
    } catch (error) {
      this.logger.warn(`Failed to terminate Tesseract worker: ${error}`);
    }
  }

  /**
   * Run OCR on the buffer and derive K-1 fields and metadata from the text.
   *
   * The buffer is first handed to the Tesseract worker as-is. If that
   * throws, pdf-parse is tried in order to recover any embedded text; if
   * that fails as well, extraction continues with empty text and therefore
   * yields no fields.
   *
   * @param buffer   Raw bytes of the uploaded document.
   * @param fileName Used for logging only.
   * @returns Extraction result with method 'tesseract'; `overallConfidence`
   *          is the mean field confidence rounded to two decimals (0 when
   *          no field was found) and `unmappedItems` is always empty.
   */
  public async extract(
    buffer: Buffer,
    fileName: string
  ): Promise<K1ExtractionResult> {
    this.logger.log(`Extracting from scanned PDF via Tesseract.js: ${fileName}`);

    const Tesseract = await import('tesseract.js');

    // Create worker if not yet initialized
    if (!this.worker) {
      this.worker = await Tesseract.createWorker('eng');
    }

    let text = '';
    let pageCount = 1;

    try {
      // Try to recognize text directly from the raw buffer.
      const result = await this.worker.recognize(buffer);
      text = result.data.text;
    } catch (error) {
      this.logger.warn(
        `Tesseract direct PDF recognition failed, trying alternative approach: ${error}`
      );

      // Fallback: try pdf-parse to at least get any embedded text
      try {
        const pdfParseModule = await import('pdf-parse');
        const pdfParse = (pdfParseModule as any).default || pdfParseModule;
        const parsed = await pdfParse(buffer);
        text = parsed.text ?? '';
        pageCount =
          typeof parsed.numpages === 'number' && parsed.numpages > 0
            ? parsed.numpages
            : 1;
      } catch (parseError) {
        this.logger.error(
          `Both Tesseract and pdf-parse failed: ${parseError}`
        );
        text = '';
      }
    }

    // Regex-based extraction on the OCR'd text, with a lower base
    // confidence because OCR text is less reliable.
    const fields = this.extractBoxValues(text);
    const metadata = this.extractMetadata(text);

    const totalConfidence = fields.reduce((sum, f) => sum + f.confidence, 0);
    const overallConfidence =
      fields.length > 0 ? totalConfidence / fields.length : 0;

    return {
      metadata,
      fields,
      unmappedItems: [],
      overallConfidence: Math.round(overallConfidence * 100) / 100,
      method: 'tesseract',
      pagesProcessed: pageCount
    };
  }

  /**
   * Apply BOX_PATTERNS to the text and emit one field per box whose label
   * pattern matches. Confidence starts at 0.65 and gains 0.1 when the
   * captured value parses as a number.
   */
  private extractBoxValues(text: string): K1ExtractedField[] {
    if (!text) return [];

    const fields: K1ExtractedField[] = [];

    for (const box of TesseractExtractor.BOX_PATTERNS) {
      for (const pattern of box.patterns) {
        const match = text.match(pattern);
        if (!match) {
          continue;
        }

        const rawValue = match[1].trim();
        const numericValue = this.pdfParseExtractor.parseNumericValue(rawValue);

        // Tesseract: lower base confidence of 0.65
        let confidence = 0.65;
        if (numericValue !== null) {
          confidence += 0.1; // Value format validated
        }

        let confidenceLevel: 'HIGH' | 'MEDIUM' | 'LOW';
        if (confidence >= 0.85) {
          confidenceLevel = 'HIGH';
        } else if (confidence >= 0.6) {
          confidenceLevel = 'MEDIUM';
        } else {
          confidenceLevel = 'LOW';
        }

        fields.push({
          boxNumber: box.boxNumber,
          label: '',
          customLabel: null,
          rawValue,
          numericValue,
          confidence: Math.round(confidence * 100) / 100,
          confidenceLevel,
          isUserEdited: false,
          isReviewed: false
        });

        // First matching pattern wins for this box.
        break;
      }
    }

    return fields;
  }

  /**
   * Derive document-level metadata from the OCR text with label-anchored
   * regular expressions.
   *
   * NOTE(review): `isAmended` and `isFinal` are plain keyword searches, so
   * they are true whenever the words occur anywhere in the text — including
   * a printed checkbox label; confirm this is acceptable for OCR input.
   */
  private extractMetadata(text: string): K1ExtractionResult['metadata'] {
    return {
      partnershipName: this.extractPattern(
        text,
        /partnership['']s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i
      ),
      partnershipEin: this.extractPattern(
        text,
        /partnership['']s?\s+(?:employer\s+identification\s+number|EIN)[^:\n]*[:\s]+(\d{2}[- ]\d{7})/i
      ),
      partnerName: this.extractPattern(
        text,
        /partner['']s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i
      ),
      partnerEin: this.extractPattern(
        text,
        /partner['']s?\s+(?:identifying|social\s+security)\s+number[^:\n]*[:\s]+(\d{2}[- ]\d{7}|\d{3}[- ]\d{2}[- ]\d{4})/i
      ),
      taxYear: this.extractTaxYear(text),
      isAmended: /amended/i.test(text),
      isFinal: /final\s+k-?1/i.test(text) || /final\s+return/i.test(text)
    };
  }

  /** Return the trimmed first capture group of `pattern`, or null if no match. */
  private extractPattern(text: string, pattern: RegExp): string | null {
    const match = text.match(pattern);
    return match ? match[1].trim() : null;
  }

  /**
   * Find a four-digit year following "calendar year" or "tax year" and
   * return it when it lies within 1900–2100; otherwise null.
   */
  private extractTaxYear(text: string): number | null {
    const match = text.match(/(?:calendar\s+year|tax\s+year)\s*(\d{4})/i);
    if (match) {
      const year = parseInt(match[1], 10);
      if (year >= 1900 && year <= 2100) return year;
    }
    return null;
  }
}

96
apps/api/src/app/k1-import/k1-aggregation.service.ts

@ -0,0 +1,96 @@
import { PrismaService } from '@ghostfolio/api/services/prisma/prisma.service';
import type { K1AggregationResult } from '@ghostfolio/common/interfaces';
import { HttpException, Injectable } from '@nestjs/common';
import { StatusCodes, getReasonPhrase } from 'http-status-codes';
import { CellMappingService } from '../cell-mapping/cell-mapping.service';
/**
 * Service for computing dynamic aggregation totals
 * from CellAggregationRule records.
 * FR-034, FR-039: Computed dynamically, only rules persisted.
 */
@Injectable()
export class K1AggregationService {
  public constructor(
    private readonly prismaService: PrismaService,
    private readonly cellMappingService: CellMappingService
  ) {}

  /**
   * Compute aggregation results for a set of extracted/verified fields.
   * Used during verification (live recalculation on cell edit) and
   * after confirmation.
   *
   * For every rule the value of each source cell is looked up by box number
   * (missing boxes and null values count as 0) and recorded in `breakdown`.
   * Only the 'SUM' operation accumulates into `computedValue`; for any other
   * operation `computedValue` stays 0.
   *
   * @param fields        Box values; when a box number occurs more than
   *                      once, the first occurrence is used.
   * @param partnershipId Optional partnership whose rules should be loaded.
   */
  public async computeFromFields(
    fields: Array<{ boxNumber: string; numericValue: number | null }>,
    partnershipId?: string
  ): Promise<K1AggregationResult[]> {
    const rules =
      await this.cellMappingService.getAggregationRules(partnershipId);

    // Index the fields once instead of scanning the array for every source
    // cell of every rule. First occurrence wins, mirroring Array.find.
    const valueByBox = new Map<string, number | null>();
    for (const { boxNumber, numericValue } of fields) {
      if (!valueByBox.has(boxNumber)) {
        valueByBox.set(boxNumber, numericValue);
      }
    }

    return rules.map((rule) => {
      // sourceCells comes from persisted rule data; anything that is not an
      // array is treated as "no source cells" rather than iterated blindly.
      const rawSourceCells = rule.sourceCells as unknown;
      const sourceCells: string[] = Array.isArray(rawSourceCells)
        ? (rawSourceCells as string[])
        : [];

      const breakdown: Record<string, number> = {};
      let computedValue = 0;

      for (const boxNumber of sourceCells) {
        const value = valueByBox.get(boxNumber) ?? 0;
        breakdown[boxNumber] = value;

        if (rule.operation === 'SUM') {
          computedValue += value;
        }
      }

      return {
        ruleId: rule.id,
        name: rule.name,
        operation: rule.operation,
        sourceCells,
        computedValue,
        breakdown
      };
    });
  }

  /**
   * Compute aggregation results for a KDocument (stored box values).
   * GET /aggregation-rules/compute
   *
   * @param kDocumentId   Id of the stored document.
   * @param partnershipId Optional override; when empty or omitted the
   *                      document's own partnership id is used.
   * @throws HttpException with status 404 when the document does not exist.
   */
  public async computeForKDocument(
    kDocumentId: string,
    partnershipId?: string
  ): Promise<K1AggregationResult[]> {
    const kDocument = await this.prismaService.kDocument.findUnique({
      where: { id: kDocumentId }
    });

    if (!kDocument) {
      throw new HttpException(
        getReasonPhrase(StatusCodes.NOT_FOUND),
        StatusCodes.NOT_FOUND
      );
    }

    // kDocument.data stores box values as { "1": 50000, "9a": -1200, ... };
    // entries whose value is not a number are passed on as null.
    const data = (kDocument.data as any) || {};
    const fields: Array<{ boxNumber: string; numericValue: number | null }> =
      Object.entries(data).map(([boxNumber, value]) => ({
        boxNumber,
        numericValue: typeof value === 'number' ? value : null
      }));

    return this.computeFromFields(
      fields,
      partnershipId || kDocument.partnershipId
    );
  }
}

92
apps/api/src/app/k1-import/k1-allocation.service.ts

@ -0,0 +1,92 @@
import { PrismaService } from '@ghostfolio/api/services/prisma/prisma.service';
import { Injectable, Logger } from '@nestjs/common';
import { Decimal } from '@prisma/client/runtime/library';
/** Allocation of K-1 box values to a single partnership member. */
interface MemberAllocation {
// Id of the member entity (taken from the membership record).
entityId: string;
// Display name of the entity; falls back to the entity id when the name is empty.
entityName: string;
// Ownership share expressed in percent (the allocation divides by 100).
ownershipPercent: number;
// Allocated amount per box number, rounded to whole cents.
allocatedValues: Record<string, number>;
}
/**
 * Splits K-1 line items among the members of a partnership in proportion
 * to their ownership percentage (FR-013).
 *
 * Amounts are computed in whole cents. Every member except the one with the
 * largest share receives its rounded proportional amount; the largest member
 * receives whatever is left of the total, which absorbs the rounding
 * residue (validation rule 8).
 */
@Injectable()
export class K1AllocationService {
  private readonly logger = new Logger(K1AllocationService.name);

  public constructor(private readonly prismaService: PrismaService) {}

  /**
   * Allocate K-1 box values to the members active at the end of a tax year.
   *
   * A membership counts as active when its effective date is on or before
   * December 31 of `taxYear` and it has no end date or an end date on or
   * after that day.
   * NOTE(review): the cut-off is built with `new Date(year, 11, 31)`, i.e.
   * local midnight of the server — confirm this matches how membership
   * dates are stored.
   *
   * @param partnershipId Partnership whose memberships are loaded.
   * @param taxYear       Tax year of the K-1.
   * @param fields        Box values; entries without a numeric value are skipped.
   * @returns One allocation per active member, largest ownership first;
   *          an empty array when there is no active member.
   */
  public async allocateToMembers(
    partnershipId: string,
    taxYear: number,
    fields: Array<{ boxNumber: string; numericValue: number | null }>
  ): Promise<MemberAllocation[]> {
    // Dec 31 of tax year
    const taxYearEnd = new Date(taxYear, 11, 31);

    const memberships = await this.prismaService.partnershipMembership.findMany(
      {
        where: {
          partnershipId,
          effectiveDate: { lte: taxYearEnd },
          OR: [{ endDate: null }, { endDate: { gte: taxYearEnd } }]
        },
        include: {
          entity: true
        },
        orderBy: {
          ownershipPercent: 'desc' // Largest member first for rounding
        }
      }
    );

    if (!memberships.length) {
      return [];
    }

    const allocations: MemberAllocation[] = memberships.map((membership) => {
      const { entityId, entity, ownershipPercent } = membership;
      return {
        entityId,
        entityName: entity.name || entityId,
        ownershipPercent: new Decimal(ownershipPercent).toNumber(),
        allocatedValues: {}
      };
    });

    // The query orders by ownership descending, so the head is the largest.
    const [largestMember, ...smallerMembers] = allocations;

    fields.forEach(({ boxNumber, numericValue }) => {
      if (numericValue == null) {
        return;
      }

      const totalCents = Math.round(numericValue * 100);
      let distributedCents = 0;

      for (const member of smallerMembers) {
        const shareCents = Math.round(
          (totalCents * member.ownershipPercent) / 100
        );
        member.allocatedValues[boxNumber] = shareCents / 100;
        distributedCents += shareCents;
      }

      // Remainder (including any rounding residue) goes to the largest member.
      largestMember.allocatedValues[boxNumber] =
        (totalCents - distributedCents) / 100;
    });

    this.logger.log(
      `Allocated ${fields.length} fields to ${memberships.length} members for partnership ${partnershipId}`
    );

    return allocations;
  }
}

111
apps/api/src/app/k1-import/k1-confidence.service.ts

@ -0,0 +1,111 @@
import type { K1ExtractedField } from '@ghostfolio/common/interfaces';
import { Injectable } from '@nestjs/common';
/**
 * K-1 confidence scoring service.
 * Translates numeric confidence into the three display levels
 * (HIGH/MEDIUM/LOW) and lowers the confidence of fields that fail
 * cross-field plausibility checks, per research.md Decision 5.
 */
@Injectable()
export class K1ConfidenceService {
  /**
   * Re-derive the confidence level of every field from its numeric
   * confidence and then apply the cross-field validation rules.
   *
   * The input objects are not modified; the returned array holds shallow
   * copies. The extraction method argument is currently not used.
   */
  public scoreFields(
    fields: K1ExtractedField[],
    _method: 'pdf-parse' | 'azure' | 'tesseract'
  ): K1ExtractedField[] {
    const relabelled = fields.map((field) => {
      const copy: K1ExtractedField = { ...field };
      copy.confidenceLevel = this.getConfidenceLevel(field.confidence);
      return copy;
    });

    return this.applyCrossFieldValidation(relabelled);
  }

  /**
   * Map numeric confidence to three-level display.
   * HIGH (>= 0.85): Green — no user attention needed
   * MEDIUM (0.60–0.84): Yellow — optional review
   * LOW (< 0.60): Red — requires manual review
   */
  public getConfidenceLevel(
    confidence: number
  ): 'HIGH' | 'MEDIUM' | 'LOW' {
    return confidence >= 0.85
      ? 'HIGH'
      : confidence >= 0.6
        ? 'MEDIUM'
        : 'LOW';
  }

  /**
   * Mean confidence over all fields, rounded to two decimals
   * (0 for an empty list).
   */
  public calculateOverallConfidence(fields: K1ExtractedField[]): number {
    if (fields.length === 0) {
      return 0;
    }

    let total = 0;
    for (const field of fields) {
      total += field.confidence;
    }

    return Math.round((total / fields.length) * 100) / 100;
  }

  /**
   * Cross-field plausibility checks. A field that fails a check has its
   * confidence lowered (never below 0) and its level recomputed:
   * - Box 6b (qualified dividends) greater than Box 6a (ordinary
   *   dividends): 6b loses 0.2.
   * - Box 4b (total guaranteed payments) differing from Box 4 + Box 4a by
   *   more than 1% of that sum plus 1: 4b loses 0.15.
   * A check only runs when all boxes involved carry a numeric value.
   * The given field objects are modified in place and the same array is
   * returned.
   */
  private applyCrossFieldValidation(
    fields: K1ExtractedField[]
  ): K1ExtractedField[] {
    // Later entries replace earlier ones that share a box number.
    const byBox = new Map<K1ExtractedField['boxNumber'], K1ExtractedField>();
    for (const field of fields) {
      byBox.set(field.boxNumber, field);
    }

    const penalise = (field: K1ExtractedField, amount: number): void => {
      field.confidence = Math.max(field.confidence - amount, 0);
      field.confidenceLevel = this.getConfidenceLevel(field.confidence);
    };

    // Rule: Box 6b <= Box 6a
    const ordinaryDividends = byBox.get('6a');
    const qualifiedDividends = byBox.get('6b');
    if (
      ordinaryDividends?.numericValue != null &&
      qualifiedDividends?.numericValue != null &&
      qualifiedDividends.numericValue > ordinaryDividends.numericValue
    ) {
      // Possible extraction error on 6b
      penalise(qualifiedDividends, 0.2);
    }

    // Rule: Box 4b should approximately equal Box 4 + Box 4a
    const forServices = byBox.get('4');
    const forCapital = byBox.get('4a');
    const totalGuaranteed = byBox.get('4b');
    if (
      forServices?.numericValue != null &&
      forCapital?.numericValue != null &&
      totalGuaranteed?.numericValue != null
    ) {
      const expectedTotal = forServices.numericValue + forCapital.numericValue;
      const deviation = Math.abs(totalGuaranteed.numericValue - expectedTotal);
      // Allow 1% tolerance for rounding
      const allowed = Math.abs(expectedTotal * 0.01) + 1;
      if (deviation > allowed) {
        penalise(totalGuaranteed, 0.15);
      }
    }

    return fields;
  }

  /**
   * Auto-set isReviewed for high-confidence fields per Decision 12.
   * Fields already reviewed stay reviewed; HIGH-level fields become
   * reviewed; medium/low ones keep requiring explicit user review.
   * Returns shallow copies and leaves the input objects untouched.
   */
  public applyAutoReview(fields: K1ExtractedField[]): K1ExtractedField[] {
    return fields.map((field) => {
      const copy: K1ExtractedField = { ...field };
      copy.isReviewed = field.isReviewed || field.confidenceLevel === 'HIGH';
      return copy;
    });
  }
}

160
apps/api/src/app/k1-import/k1-field-mapper.service.ts

@ -0,0 +1,160 @@
import type { K1ExtractedField, K1ExtractionResult, K1UnmappedItem } from '@ghostfolio/common/interfaces';
import { Injectable, Logger } from '@nestjs/common';
import { CellMappingService } from '../cell-mapping/cell-mapping.service';
import { K1ConfidenceService } from './k1-confidence.service';
/**
 * Turns raw extraction output into labelled K-1 box fields by joining it
 * with the cell mapping configuration, then applies confidence scoring and
 * auto-review.
 */
@Injectable()
export class K1FieldMapperService {
  private readonly logger = new Logger(K1FieldMapperService.name);

  public constructor(
    private readonly cellMappingService: CellMappingService,
    private readonly confidenceService: K1ConfidenceService
  ) {}

  /**
   * Label extracted fields with their cell mapping and separate out the
   * ones that have none.
   *
   * - A field whose box has a mapping flagged as ignored is dropped.
   * - A field whose box has a mapping receives the mapping's label and
   *   cell type.
   * - A field without a mapping is appended to the unmapped items (after
   *   the ones the extractor already reported), with page number 1.
   *
   * Mapped fields are ordered by the mapping's sort order (999 when
   * absent), scored, auto-reviewed, and the overall confidence is
   * recalculated from them.
   */
  public async mapFields(
    extractionResult: K1ExtractionResult,
    partnershipId: string
  ): Promise<K1ExtractionResult> {
    // Load cell mappings for this partnership (with global fallback)
    const mappings = await this.cellMappingService.getMappings(partnershipId);
    const mappingByBox = new Map(mappings.map((m) => [m.boxNumber, m]));

    const labelled: K1ExtractedField[] = [];
    const unmappedItems: K1UnmappedItem[] = [
      ...extractionResult.unmappedItems
    ];

    for (const field of extractionResult.fields) {
      const mapping = mappingByBox.get(field.boxNumber);

      if (!mapping) {
        // Field has a box number but no corresponding cell mapping
        this.logger.debug(
          `No cell mapping for box ${field.boxNumber}, adding to unmapped items`
        );
        unmappedItems.push({
          rawLabel: field.label || `Box ${field.boxNumber}`,
          rawValue: field.rawValue,
          numericValue: field.numericValue,
          confidence: field.confidence,
          pageNumber: 1, // Default page number when unknown
          resolution: null,
          assignedBoxNumber: null
        });
        continue;
      }

      if (mapping.isIgnored) {
        // Ignored boxes are filtered out of the extraction results
        this.logger.debug(
          `Skipping ignored field: box ${field.boxNumber}`
        );
        continue;
      }

      labelled.push({
        ...field,
        label: mapping.label,
        cellType: mapping.cellType
      } as any);
    }

    const sortOrderOf = (field: K1ExtractedField): number =>
      mappingByBox.get(field.boxNumber)?.sortOrder ?? 999;

    // Sort mapped fields by the cell mapping sort order
    const ordered = labelled.sort(
      (left, right) => sortOrderOf(left) - sortOrderOf(right)
    );

    // Score, then auto-accept high-confidence values
    const scored = this.confidenceService.scoreFields(
      ordered,
      extractionResult.method
    );
    const reviewed = this.confidenceService.applyAutoReview(scored);

    return {
      ...extractionResult,
      fields: reviewed,
      unmappedItems,
      overallConfidence:
        this.confidenceService.calculateOverallConfidence(reviewed)
    };
  }

  /**
   * Add an empty placeholder field for every non-ignored cell mapping whose
   * box is absent from the result, so the verification screen shows all
   * expected K-1 boxes. Placeholders carry an empty raw value, a null
   * numeric value, confidence 1.0 / HIGH and are pre-reviewed.
   * The combined list is returned in natural box number order.
   */
  public async fillMissingBoxes(
    result: K1ExtractionResult,
    partnershipId: string
  ): Promise<K1ExtractionResult> {
    const mappings = await this.cellMappingService.getMappings(partnershipId);
    const presentBoxes = new Set(result.fields.map((f) => f.boxNumber));

    const placeholders: K1ExtractedField[] = mappings
      .filter(
        (mapping) =>
          !mapping.isIgnored && !presentBoxes.has(mapping.boxNumber)
      )
      .map(
        (mapping) =>
          ({
            boxNumber: mapping.boxNumber,
            label: mapping.label,
            customLabel: null,
            rawValue: '',
            numericValue: null,
            confidence: 1.0, // Empty fields have full confidence
            confidenceLevel: 'HIGH',
            isUserEdited: false,
            isReviewed: true, // No review needed for empty fields
            cellType: mapping.cellType
          }) as any
      );

    const combined = [...result.fields, ...placeholders];
    combined.sort((left, right) =>
      this.compareBoxNumbers(left.boxNumber, right.boxNumber)
    );

    return {
      ...result,
      fields: combined
    };
  }

  /**
   * Natural ordering for box numbers (1, 2, 3, 4, 4a, 4b, 5, 6a, ...).
   * A box number consists of digits optionally followed by one lower-case
   * letter; anything else is ranked as number 999 and compared by its
   * full text.
   */
  private compareBoxNumbers(a: string, b: string): number {
    const split = (box: string): { num: number; sub: string } => {
      const parts = box.match(/^(\d+)([a-z]?)$/);
      return parts
        ? { num: parseInt(parts[1], 10), sub: parts[2] || '' }
        : { num: 999, sub: box };
    };

    const left = split(a);
    const right = split(b);

    return left.num !== right.num
      ? left.num - right.num
      : left.sub.localeCompare(right.sub);
  }
}

138
apps/api/src/app/k1-import/k1-import.controller.ts

@ -0,0 +1,138 @@
import { HasPermission } from '@ghostfolio/api/decorators/has-permission.decorator';
import { HasPermissionGuard } from '@ghostfolio/api/guards/has-permission.guard';
import { permissions } from '@ghostfolio/common/permissions';
import type { RequestWithUser } from '@ghostfolio/common/types';
import {
Body,
Controller,
Get,
HttpCode,
Inject,
Param,
Post,
Put,
Query,
UploadedFile,
UseGuards,
UseInterceptors
} from '@nestjs/common';
import { REQUEST } from '@nestjs/core';
import { AuthGuard } from '@nestjs/passport';
import { FileInterceptor } from '@nestjs/platform-express';
import { StatusCodes } from 'http-status-codes';
import { ConfirmK1Dto } from './dto/confirm-k1.dto';
import { VerifyK1Dto } from './dto/verify-k1.dto';
import { K1ImportService } from './k1-import.service';
/**
* HTTP endpoints for the K-1 import workflow, mounted under `k1-import`.
* Every route requires JWT authentication plus the permission named in its
* `@HasPermission` decorator, and delegates to K1ImportService together
* with the id of the requesting user.
*/
@Controller('k1-import')
export class K1ImportController {
public constructor(
private readonly k1ImportService: K1ImportService,
@Inject(REQUEST) private readonly request: RequestWithUser
) {}
/**
* POST /api/v1/k1-import/upload
* Upload a K-1 PDF and initiate extraction.
*
* The file arrives in the multipart field `file`; `partnershipId` and
* `taxYear` are read from the request body. `taxYear` is parsed as a
* base-10 integer, so a missing or non-numeric value is forwarded to the
* service as NaN.
*/
@HasPermission(permissions.createKDocument)
@Post('upload')
@HttpCode(StatusCodes.CREATED)
@UseGuards(AuthGuard('jwt'), HasPermissionGuard)
@UseInterceptors(FileInterceptor('file'))
public async uploadK1(@UploadedFile() file: any) {
const body = this.request.body as any;
const taxYear = parseInt(body.taxYear, 10);
return this.k1ImportService.uploadAndExtract({
file,
partnershipId: body.partnershipId,
taxYear,
userId: this.request.user.id
});
}
/**
* GET /api/v1/k1-import/history
* Get import history for a partnership.
*
* `taxYear` is optional; when present it is parsed as a base-10 integer,
* otherwise `undefined` is passed to the service.
* NOTE(review): declared ahead of the `:id` route, presumably so that
* "history" is not captured as an id — confirm.
*/
@HasPermission(permissions.readKDocument)
@Get('history')
@UseGuards(AuthGuard('jwt'), HasPermissionGuard)
public async getImportHistory(
@Query('partnershipId') partnershipId: string,
@Query('taxYear') taxYear?: string
) {
return this.k1ImportService.getHistory(
this.request.user.id,
partnershipId,
taxYear ? parseInt(taxYear, 10) : undefined
);
}
/**
* GET /api/v1/k1-import/:id
* Get the current state of an import session.
*/
@HasPermission(permissions.readKDocument)
@Get(':id')
@UseGuards(AuthGuard('jwt'), HasPermissionGuard)
public async getImportSession(@Param('id') id: string) {
return this.k1ImportService.getSession(id, this.request.user.id);
}
/**
* PUT /api/v1/k1-import/:id/verify
* Submit user-verified extraction data.
*/
@HasPermission(permissions.updateKDocument)
@Put(':id/verify')
@UseGuards(AuthGuard('jwt'), HasPermissionGuard)
public async verifyImportSession(
@Param('id') id: string,
@Body() data: VerifyK1Dto
) {
return this.k1ImportService.verify(id, this.request.user.id, data);
}
/**
* POST /api/v1/k1-import/:id/cancel
* Cancel an import session. Responds with 200 instead of the POST default.
*/
@HasPermission(permissions.updateKDocument)
@Post(':id/cancel')
@HttpCode(StatusCodes.OK)
@UseGuards(AuthGuard('jwt'), HasPermissionGuard)
public async cancelImportSession(@Param('id') id: string) {
return this.k1ImportService.cancel(id, this.request.user.id);
}
/**
* POST /api/v1/k1-import/:id/reprocess
* Re-process a previously uploaded K-1 PDF with current cell mapping.
* Responds with 200 instead of the POST default.
*/
@HasPermission(permissions.updateKDocument)
@Post(':id/reprocess')
@HttpCode(StatusCodes.OK)
@UseGuards(AuthGuard('jwt'), HasPermissionGuard)
public async reprocessImportSession(@Param('id') id: string) {
return this.k1ImportService.reprocess(id, this.request.user.id);
}
/**
* POST /api/v1/k1-import/:id/confirm
* Confirm verified data and trigger auto-creation of model objects.
* Responds with 201.
*/
@HasPermission(permissions.createKDocument)
@Post(':id/confirm')
@HttpCode(StatusCodes.CREATED)
@UseGuards(AuthGuard('jwt'), HasPermissionGuard)
public async confirmImportSession(
@Param('id') id: string,
@Body() data: ConfirmK1Dto
) {
return this.k1ImportService.confirm(id, this.request.user.id, data);
}
}

68
apps/api/src/app/k1-import/k1-import.module.ts

@ -0,0 +1,68 @@
import { ConfigurationModule } from '@ghostfolio/api/services/configuration/configuration.module';
import { PrismaModule } from '@ghostfolio/api/services/prisma/prisma.module';
import { Module } from '@nestjs/common';
import { MulterModule } from '@nestjs/platform-express';
import { diskStorage } from 'multer';
import { existsSync, mkdirSync } from 'node:fs';
import { join } from 'node:path';
import { v4 as uuidv4 } from 'uuid';
import { CellMappingModule } from '../cell-mapping/cell-mapping.module';
import { UploadModule } from '../upload/upload.module';
import { K1ImportController } from './k1-import.controller';
import { K1ImportService } from './k1-import.service';
import { K1AggregationService } from './k1-aggregation.service';
import { K1AllocationService } from './k1-allocation.service';
import { K1ConfidenceService } from './k1-confidence.service';
import { K1FieldMapperService } from './k1-field-mapper.service';
import { AzureExtractor } from './extractors/azure-extractor';
import { PdfParseExtractor } from './extractors/pdf-parse-extractor';
import { TesseractExtractor } from './extractors/tesseract-extractor';
// Root directory for uploaded files; the UPLOAD_DIR environment variable
// overrides the default "<cwd>/uploads".
const uploadDir = process.env.UPLOAD_DIR || join(process.cwd(), 'uploads');

/**
 * Nest module for the K-1 import feature.
 *
 * Registers the import controller, the extraction/mapping/allocation
 * services and a Multer disk storage that accepts uploads of up to 25 MB.
 */
@Module({
  controllers: [K1ImportController],
  exports: [K1ImportService],
  imports: [
    CellMappingModule,
    ConfigurationModule,
    MulterModule.register({
      limits: {
        fileSize: 25 * 1024 * 1024 // 25 MB
      },
      storage: diskStorage({
        // Files are bucketed into <uploadDir>/<YYYY>/<MM>, created on demand.
        destination: (_req, _file, cb) => {
          const now = new Date();
          const yearDir = now.getFullYear().toString();
          const monthDir = (now.getMonth() + 1).toString().padStart(2, '0');
          const subDir = join(uploadDir, yearDir, monthDir);

          if (!existsSync(subDir)) {
            mkdirSync(subDir, { recursive: true });
          }

          cb(null, subDir);
        },
        // Stored name is a fresh UUID plus the original extension. Only a
        // short alphanumeric extension is carried over; a name without one
        // (e.g. "K1 Smith 2023") no longer ends up appended in full after
        // the dot.
        filename: (_req, file, cb) => {
          const extensionMatch = /\.([A-Za-z0-9]{1,16})$/.exec(
            file.originalname
          );
          const suffix = extensionMatch ? `.${extensionMatch[1]}` : '';

          cb(null, `${uuidv4()}${suffix}`);
        }
      })
    }),
    PrismaModule,
    UploadModule
  ],
  providers: [
    AzureExtractor,
    K1AggregationService,
    K1AllocationService,
    K1ConfidenceService,
    K1FieldMapperService,
    K1ImportService,
    PdfParseExtractor,
    TesseractExtractor
  ]
})
export class K1ImportModule {}

903
apps/api/src/app/k1-import/k1-import.service.ts

@ -0,0 +1,903 @@
import { PrismaService } from '@ghostfolio/api/services/prisma/prisma.service';
import type { K1ExtractionResult } from '@ghostfolio/common/interfaces';
import { HttpException, Injectable, Logger } from '@nestjs/common';
import { K1ImportStatus, KDocumentStatus } from '@prisma/client';
import { StatusCodes, getReasonPhrase } from 'http-status-codes';
import { readFile } from 'node:fs/promises';
import { join } from 'node:path';
import { UploadService } from '../upload/upload.service';
import { AzureExtractor } from './extractors/azure-extractor';
import { PdfParseExtractor } from './extractors/pdf-parse-extractor';
import { TesseractExtractor } from './extractors/tesseract-extractor';
import { K1AllocationService } from './k1-allocation.service';
import { K1FieldMapperService } from './k1-field-mapper.service';
// Upper bound, in bytes, for an uploaded K-1 PDF; larger files are rejected
// by uploadAndExtract with a 400 response.
const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25 MB
@Injectable()
export class K1ImportService {
private readonly logger = new Logger(K1ImportService.name);
/**
 * All collaborators are injected as private read-only members: database
 * access, document storage, field mapping, member allocation and the three
 * PDF extractors (pdf-parse, Azure, tesseract).
 */
public constructor(
private readonly prismaService: PrismaService,
private readonly uploadService: UploadService,
private readonly fieldMapperService: K1FieldMapperService,
private readonly allocationService: K1AllocationService,
private readonly pdfParseExtractor: PdfParseExtractor,
private readonly azureExtractor: AzureExtractor,
private readonly tesseractExtractor: TesseractExtractor
) {}
/**
 * Accepts an uploaded K-1 PDF, validates it, records it and kicks off
 * extraction in the background.
 * FR-001, FR-003, FR-005, FR-028
 *
 * Validation order: MIME type, size, partnership ownership, active members,
 * tax year versus partnership inception year. Every validation failure is
 * reported as a 400 HttpException.
 *
 * @returns A summary of the newly created import session (status PROCESSING).
 */
public async uploadAndExtract({
  file,
  partnershipId,
  taxYear,
  userId
}: {
  file: any;
  partnershipId: string;
  taxYear: number;
  userId: string;
}) {
  // FR-003: only PDFs are accepted
  const isPdf = file.mimetype === 'application/pdf';

  if (!isPdf) {
    throw new HttpException(
      'File is not a valid PDF',
      StatusCodes.BAD_REQUEST
    );
  }

  // FR-028: enforce the size ceiling
  const isTooLarge = file.size > MAX_FILE_SIZE;

  if (isTooLarge) {
    throw new HttpException(
      'File exceeds 25 MB size limit',
      StatusCodes.BAD_REQUEST
    );
  }

  // The partnership must exist and be owned by the caller; only memberships
  // without an end date are loaded.
  const ownedPartnership = await this.prismaService.partnership.findFirst({
    where: { id: partnershipId, userId },
    include: {
      members: {
        where: { endDate: null }
      }
    }
  });

  if (!ownedPartnership) {
    throw new HttpException(
      'Partnership not found or not owned by user',
      StatusCodes.BAD_REQUEST
    );
  }

  if (!ownedPartnership.members?.length) {
    throw new HttpException(
      'Partnership has no active members',
      StatusCodes.BAD_REQUEST
    );
  }

  // The tax year may not precede the partnership's inception year
  if (ownedPartnership.inceptionDate) {
    const inceptionYear = new Date(
      ownedPartnership.inceptionDate
    ).getFullYear();

    if (taxYear < inceptionYear) {
      throw new HttpException(
        `Tax year must be >= partnership inception year (${inceptionYear})`,
        StatusCodes.BAD_REQUEST
      );
    }
  }

  // Persist a Document record for the uploaded PDF
  const storedDocument = await this.uploadService.createDocument({
    file,
    partnershipId,
    taxYear,
    type: 'K1',
    name: file.originalname
  });

  // Open the import session in PROCESSING status
  const session = await this.prismaService.k1ImportSession.create({
    data: {
      partnershipId,
      userId,
      status: K1ImportStatus.PROCESSING,
      taxYear,
      fileName: file.originalname,
      fileSize: file.size,
      extractionMethod: 'pending',
      documentId: storedDocument.id
    }
  });

  // Fire-and-forget: the response must not wait for extraction to finish
  this.runExtraction(session.id, file, partnershipId).catch((err) => {
    this.logger.error(
      `Extraction failed for session ${session.id}: ${err.message}`,
      err.stack
    );
  });

  const { id, status, fileName, fileSize, extractionMethod, createdAt } =
    session;

  return {
    id,
    partnershipId: session.partnershipId,
    status,
    taxYear: session.taxYear,
    fileName,
    fileSize,
    extractionMethod,
    createdAt
  };
}
/**
 * Loads an import session and enforces that it belongs to the caller.
 *
 * @throws HttpException 404 when no session has the given id.
 * @throws HttpException 403 when the session belongs to another user.
 */
public async getSession(sessionId: string, userId: string) {
  const found = await this.prismaService.k1ImportSession.findUnique({
    where: { id: sessionId }
  });

  if (!found) {
    const status = StatusCodes.NOT_FOUND;
    throw new HttpException(getReasonPhrase(status), status);
  }

  const isOwner = found.userId === userId;

  if (!isOwner) {
    const status = StatusCodes.FORBIDDEN;
    throw new HttpException(getReasonPhrase(status), status);
  }

  return found;
}
/**
 * Run the two-tier extraction pipeline for a session and persist the result.
 * Tier 1: pdf-parse (for digital PDFs)
 * Tier 2: Azure DI or tesseract.js (for scanned PDFs)
 *
 * The file content is taken, in order of preference, from the stored
 * document on disk, from `file.path`, or from `file.buffer`.
 *
 * The final status write (EXTRACTED on success, FAILED on error) is applied
 * only while the session is still PROCESSING, so a session the user
 * cancelled during extraction is not silently revived.
 *
 * This method catches extraction errors itself and records them on the
 * session; it rejects only if recording the failure fails as well.
 *
 * @param sessionId     Import session to update.
 * @param file          Upload descriptor (Multer-like: `path` and/or `buffer`).
 * @param partnershipId Partnership whose cell mapping is applied.
 */
private async runExtraction(
  sessionId: string,
  file: any,
  partnershipId: string
) {
  // Only a session that is still being processed may receive a result.
  const stillProcessing = {
    id: sessionId,
    status: K1ImportStatus.PROCESSING
  };

  try {
    // Read the file buffer
    const uploadDir = this.uploadService.getUploadDir();
    const doc = await this.prismaService.k1ImportSession.findUnique({
      where: { id: sessionId },
      include: { document: true }
    });

    let buffer: Buffer;

    if (doc?.document?.filePath) {
      const fullPath = join(uploadDir, doc.document.filePath);
      buffer = await readFile(fullPath);
    } else if (file.path) {
      buffer = await readFile(file.path);
    } else if (file.buffer) {
      buffer = file.buffer;
    } else {
      throw new Error('No file buffer available');
    }

    // Check for password-protected PDFs (FR-029)
    await this.checkPasswordProtected(buffer);

    const fileName = doc?.fileName || 'unknown.pdf';
    let extractionResult: K1ExtractionResult;
    let method: string;

    if (await this.pdfParseExtractor.isDigitalK1(buffer)) {
      // Tier 1: digital PDF
      this.logger.log(`Session ${sessionId}: Digital K-1 detected, using pdf-parse`);
      extractionResult = await this.pdfParseExtractor.extract(buffer, fileName);
      method = 'pdf-parse';
    } else if (this.azureExtractor.isAvailable()) {
      // Tier 2: scanned PDF, Azure preferred
      this.logger.log(`Session ${sessionId}: Scanned K-1, using Azure DI`);
      extractionResult = await this.azureExtractor.extract(buffer, fileName);
      method = 'azure';
    } else {
      // Tier 2 fallback: local OCR
      this.logger.log(
        `Session ${sessionId}: Scanned K-1, using tesseract.js (Azure not configured)`
      );
      extractionResult = await this.tesseractExtractor.extract(
        buffer,
        fileName
      );
      method = 'tesseract';
    }

    // Map fields using cell mapping configuration
    const mappedResult = await this.fieldMapperService.mapFields(
      extractionResult,
      partnershipId
    );

    // Fill in missing boxes (empty values for unmapped IRS boxes)
    const completeResult = await this.fieldMapperService.fillMissingBoxes(
      mappedResult,
      partnershipId
    );

    // Generate edge case warnings (FR-029, Edge Cases 3-6)
    const warnings = await this.generateWarnings(
      sessionId,
      completeResult,
      partnershipId,
      buffer
    );

    if (warnings.length > 0) {
      this.logger.warn(
        `Session ${sessionId}: ${warnings.length} warning(s) detected: ${warnings.join('; ')}`
      );
    }

    // Store extraction results and warnings, unless the session has left
    // PROCESSING in the meantime (e.g. it was cancelled).
    const { count } = await this.prismaService.k1ImportSession.updateMany({
      where: stillProcessing,
      data: {
        status: K1ImportStatus.EXTRACTED,
        extractionMethod: method,
        rawExtraction: {
          ...completeResult,
          warnings
        } as any
      }
    });

    if (count === 0) {
      this.logger.warn(
        `Session ${sessionId}: no longer in PROCESSING status, extraction result discarded`
      );
      return;
    }

    this.logger.log(
      `Session ${sessionId}: Extraction complete (${method}), ${completeResult.fields.length} fields, confidence ${completeResult.overallConfidence}`
    );
  } catch (error) {
    // Anything can be thrown; normalise before reading message/stack.
    const failure = error instanceof Error ? error : new Error(String(error));

    this.logger.error(
      `Session ${sessionId}: Extraction failed: ${failure.message}`,
      failure.stack
    );

    await this.prismaService.k1ImportSession.updateMany({
      where: stillProcessing,
      data: {
        status: K1ImportStatus.FAILED,
        errorMessage: failure.message || 'Extraction failed'
      }
    });
  }
}
/**
 * Stores user-verified extraction data and moves the session from
 * EXTRACTED to VERIFIED.
 * FR-006 through FR-010, FR-035 (block if unreviewed medium/low), validation rule 10
 *
 * @throws HttpException 400 when the session is not EXTRACTED, when no
 *         fields are supplied, when a MEDIUM/LOW confidence field is not
 *         marked as reviewed, or when an unmapped item has no resolution.
 * @returns The updated session record.
 */
public async verify(
  sessionId: string,
  userId: string,
  data: {
    taxYear: number;
    fields: any[];
    unmappedItems?: any[];
  }
) {
  const session = await this.getSession(sessionId, userId);

  // Guard: status must be EXTRACTED
  const canVerify = session.status === K1ImportStatus.EXTRACTED;

  if (!canVerify) {
    throw new HttpException(
      'Session must be in EXTRACTED status to verify',
      StatusCodes.BAD_REQUEST
    );
  }

  // Guard: at least one field
  if (!data.fields?.length) {
    throw new HttpException(
      'Fields array cannot be empty',
      StatusCodes.BAD_REQUEST
    );
  }

  // FR-035: every medium/low-confidence field needs an explicit review
  const reviewRequired = new Set(['MEDIUM', 'LOW']);
  const pendingReview = data.fields.filter(
    (f) => reviewRequired.has(f.confidenceLevel) && !f.isReviewed
  );

  if (pendingReview.length > 0) {
    throw new HttpException(
      `${pendingReview.length} medium/low-confidence fields have not been reviewed`,
      StatusCodes.BAD_REQUEST
    );
  }

  // Validation rule 10: every unmapped item needs a resolution
  if (data.unmappedItems?.length) {
    const pendingResolution = data.unmappedItems.filter(
      (item) => !item.resolution
    );

    if (pendingResolution.length > 0) {
      throw new HttpException(
        `${pendingResolution.length} unmapped items have not been resolved`,
        StatusCodes.BAD_REQUEST
      );
    }
  }

  // Persist the verified payload together with the status transition
  const verifiedData = {
    fields: data.fields,
    unmappedItems: data.unmappedItems || []
  };

  const updated = await this.prismaService.k1ImportSession.update({
    where: { id: sessionId },
    data: {
      status: K1ImportStatus.VERIFIED,
      taxYear: data.taxYear,
      verifiedData: verifiedData as any
    }
  });

  this.logger.log(
    `Session ${sessionId}: Verified with ${data.fields.length} fields`
  );

  return updated;
}
/**
 * Marks an import session as CANCELLED.
 * FR-011: Discard extraction data, status CANCELLED.
 *
 * @throws HttpException 400 when the session is already CONFIRMED or
 *         CANCELLED.
 * @returns The updated session record.
 */
public async cancel(sessionId: string, userId: string) {
  const session = await this.getSession(sessionId, userId);

  // These two statuses are final with respect to cancellation
  const nonCancellable: K1ImportStatus[] = [
    K1ImportStatus.CONFIRMED,
    K1ImportStatus.CANCELLED
  ];

  if (nonCancellable.includes(session.status)) {
    throw new HttpException(
      `Cannot cancel a session in ${session.status} status`,
      StatusCodes.BAD_REQUEST
    );
  }

  const cancelled = await this.prismaService.k1ImportSession.update({
    where: { id: sessionId },
    data: { status: K1ImportStatus.CANCELLED }
  });

  this.logger.log(`Session ${sessionId}: Cancelled`);

  return cancelled;
}
/**
 * Lists the caller's import sessions for one partnership, newest first.
 * FR-022: History of all K-1 import attempts per partnership.
 *
 * @param taxYear Optional filter; a falsy value (undefined, 0, NaN) means
 *                that no tax-year filter is applied.
 * @returns Summary rows (id, partnershipId, status, taxYear, fileName,
 *          extractionMethod, kDocumentId, createdAt).
 */
public async getHistory(
  userId: string,
  partnershipId: string,
  taxYear?: number
) {
  const where: any = {
    userId,
    partnershipId,
    ...(taxYear ? { taxYear } : {})
  };

  return this.prismaService.k1ImportSession.findMany({
    where,
    orderBy: { createdAt: 'desc' },
    select: {
      id: true,
      partnershipId: true,
      status: true,
      taxYear: true,
      fileName: true,
      extractionMethod: true,
      kDocumentId: true,
      createdAt: true
    }
  });
}
/**
 * Re-process a previously uploaded K-1 PDF with the current cell mapping.
 * FR-023: Creates a new import session using the stored document from the original session.
 *
 * All preconditions (document id, document record, file path, readable
 * file) are checked BEFORE the new session is created, so a failure never
 * leaves behind a session stuck in PROCESSING.
 *
 * @throws HttpException 400 when the original session has no document.
 * @throws HttpException 404 when the document record no longer exists.
 * @throws HttpException 500 when the document record has no file path.
 * @returns A summary of the new import session (status PROCESSING).
 */
public async reprocess(sessionId: string, userId: string) {
  const originalSession = await this.getSession(sessionId, userId);

  if (!originalSession.documentId) {
    throw new HttpException(
      'Original session has no stored document to re-process',
      StatusCodes.BAD_REQUEST
    );
  }

  const document = await this.prismaService.document.findUnique({
    where: { id: originalSession.documentId }
  });

  if (!document) {
    throw new HttpException(
      'Stored document not found',
      StatusCodes.NOT_FOUND
    );
  }

  const relativePath = (document as any).filePath;

  if (!relativePath) {
    throw new HttpException(
      'Cannot determine file path for stored document',
      StatusCodes.INTERNAL_SERVER_ERROR
    );
  }

  // Read the stored file from the uploads directory; a read error
  // propagates before any new session exists.
  const fullPath = join(this.uploadService.getUploadDir(), relativePath);
  const fileBuffer = await readFile(fullPath);

  // Create a new import session in PROCESSING status
  const newSession = await this.prismaService.k1ImportSession.create({
    data: {
      partnershipId: originalSession.partnershipId,
      userId,
      status: K1ImportStatus.PROCESSING,
      taxYear: originalSession.taxYear,
      fileName: originalSession.fileName,
      fileSize: originalSession.fileSize,
      extractionMethod: 'pending',
      documentId: originalSession.documentId
    }
  });

  // Multer-like descriptor for the extraction pipeline
  const file = {
    buffer: fileBuffer,
    originalname: originalSession.fileName,
    mimetype: 'application/pdf',
    size: originalSession.fileSize
  };

  // Run extraction asynchronously (don't block the response)
  this.runExtraction(
    newSession.id,
    file,
    originalSession.partnershipId
  ).catch((err) => {
    this.logger.error(
      `Reprocess extraction failed for session ${newSession.id}: ${err.message}`,
      err.stack
    );
  });

  this.logger.log(
    `Session ${sessionId}: Re-processing started as new session ${newSession.id}`
  );

  return {
    id: newSession.id,
    partnershipId: newSession.partnershipId,
    status: newSession.status,
    taxYear: newSession.taxYear,
    fileName: newSession.fileName,
    fileSize: newSession.fileSize,
    extractionMethod: newSession.extractionMethod,
    createdAt: newSession.createdAt
  };
}
/**
 * Confirm verified data and auto-create model objects.
 * VERIFIED -> CONFIRMED transition.
 * FR-012 (KDocument), FR-013 (allocations), FR-014 (Distributions), FR-015 (Document linkage), FR-016 (duplicate detection).
 *
 * Steps, in order: validate session status and verified data, load the
 * memberships active at tax year end, collect ownership-change warnings,
 * detect an existing KDocument, create or update the KDocument, allocate
 * amounts to members, create Distribution records for boxes 19a/19b, and
 * finally mark the session CONFIRMED.
 *
 * NOTE(review): the writes are not wrapped in a database transaction, so a
 * failure part-way through can leave partial data behind.
 *
 * @throws HttpException 400 when the session is not VERIFIED, has no
 *         verified fields, or the partnership had no active members.
 * @throws HttpException 409 when a KDocument already exists and
 *         `existingKDocumentAction` was not supplied.
 * @returns Session status, the KDocument, created distributions, member
 *          allocations, the linked document (or null) and warnings.
 */
public async confirm(
  sessionId: string,
  userId: string,
  data: {
    filingStatus: KDocumentStatus;
    existingKDocumentAction?: 'UPDATE' | 'CREATE_NEW';
  }
) {
  const session = await this.getSession(sessionId, userId);

  // Only VERIFIED sessions can be confirmed
  if (session.status !== K1ImportStatus.VERIFIED) {
    throw new HttpException(
      'Session must be in VERIFIED status to confirm',
      StatusCodes.BAD_REQUEST
    );
  }

  const verifiedData = session.verifiedData as any;

  if (!verifiedData?.fields || verifiedData.fields.length === 0) {
    throw new HttpException(
      'No verified data available',
      StatusCodes.BAD_REQUEST
    );
  }

  // Dec 31 of the tax year (local time); used both as the membership
  // cut-off and as the distribution date.
  const taxYearEnd = new Date(session.taxYear, 11, 31);

  // Check for active members (FR-013)
  const memberships =
    await this.prismaService.partnershipMembership.findMany({
      where: {
        partnershipId: session.partnershipId,
        effectiveDate: {
          lte: taxYearEnd
        },
        OR: [{ endDate: null }, { endDate: { gte: taxYearEnd } }]
      },
      include: { entity: true }
    });

  if (memberships.length === 0) {
    throw new HttpException(
      'Partnership has no active members',
      StatusCodes.BAD_REQUEST
    );
  }

  // Edge Case 7: Ownership % change handling
  // Compare current memberships with tax year end memberships
  const confirmWarnings: string[] = [];
  const currentMemberships =
    await this.prismaService.partnershipMembership.findMany({
      where: {
        partnershipId: session.partnershipId,
        endDate: null
      },
      include: { entity: true }
    });

  for (const taxYearMember of memberships) {
    const memberName = taxYearMember.entity?.name || taxYearMember.entityId;
    const currentMember = currentMemberships.find(
      (cm) => cm.entityId === taxYearMember.entityId
    );

    if (!currentMember) {
      confirmWarnings.push(
        `Member ${memberName} was active at tax year end (${session.taxYear}) but is no longer an active member.`
      );
      continue;
    }

    const taxYearPercent = (taxYearMember as any).ownershipPercent;
    const currentPercent = (currentMember as any).ownershipPercent;

    // Compare by value, not identity: if the percentages are object-valued
    // (e.g. decimal wrappers - TODO confirm against the schema), `!==` would
    // always report a change for two separately loaded rows.
    if (String(currentPercent) !== String(taxYearPercent)) {
      confirmWarnings.push(
        `Ownership for ${memberName} changed from ${taxYearPercent}% (tax year ${session.taxYear}) to ${currentPercent}% (current). Allocations use the tax year end percentage.`
      );
    }
  }

  if (confirmWarnings.length > 0) {
    this.logger.warn(
      `Session ${sessionId}: Confirm warnings: ${confirmWarnings.join('; ')}`
    );
  }

  // FR-016: Check for existing KDocument (duplicate detection)
  const existingKDocument = await this.prismaService.kDocument.findUnique({
    where: {
      partnershipId_type_taxYear: {
        partnershipId: session.partnershipId,
        type: 'K1',
        taxYear: session.taxYear
      }
    }
  });

  if (existingKDocument && !data.existingKDocumentAction) {
    throw new HttpException(
      'A KDocument already exists for this partnership, type, and tax year. Specify existingKDocumentAction (UPDATE or CREATE_NEW).',
      StatusCodes.CONFLICT
    );
  }

  // Build KDocument data from verified fields
  const kDocumentData: Record<string, number | string | null> = {};

  for (const field of verifiedData.fields) {
    // For subtype fields (e.g., box 11 "ZZ*", box 20 "A"), create unique key
    const key = field.subtype
      ? `${field.boxNumber}-${field.subtype}`
      : field.boxNumber;

    // Persist numericValue for numeric fields, rawValue for text/checkbox/string fields
    kDocumentData[key] = field.numericValue ?? field.rawValue ?? null;
  }

  // FR-012: Create or update KDocument
  let kDocument;

  if (existingKDocument && data.existingKDocumentAction === 'UPDATE') {
    // FR-025: Preserve previous values for audit trail
    kDocument = await this.prismaService.kDocument.update({
      where: { id: existingKDocument.id },
      data: {
        filingStatus: data.filingStatus,
        data: kDocumentData as any,
        previousData: existingKDocument.data as any,
        previousFilingStatus: existingKDocument.filingStatus,
        documentFileId: session.documentId
      }
    });
  } else {
    // CREATE_NEW or no existing document
    if (existingKDocument && data.existingKDocumentAction === 'CREATE_NEW') {
      // Delete existing unique constraint holder to create new
      await this.prismaService.kDocument.delete({
        where: { id: existingKDocument.id }
      });
    }

    kDocument = await this.prismaService.kDocument.create({
      data: {
        partnershipId: session.partnershipId,
        type: 'K1',
        taxYear: session.taxYear,
        filingStatus: data.filingStatus,
        data: kDocumentData as any,
        documentFileId: session.documentId
      }
    });
  }

  // FR-013: Allocate K-1 amounts to members
  const allocations = await this.allocationService.allocateToMembers(
    session.partnershipId,
    session.taxYear,
    verifiedData.fields
  );

  // FR-014: Create Distribution records for Box 19a and Box 19b.
  // Per member, 19a is written before 19b; absent or zero amounts are
  // skipped.
  const distributionBoxes = [
    { box: '19a', label: 'Cash distributions' },
    { box: '19b', label: 'Property distributions' }
  ] as const;
  const distributions: any[] = [];

  for (const allocation of allocations) {
    for (const { box, label } of distributionBoxes) {
      const amount = allocation.allocatedValues[box];

      if (!amount) {
        continue;
      }

      const dist = await this.prismaService.distribution.create({
        data: {
          partnershipId: session.partnershipId,
          entityId: allocation.entityId,
          type: 'RETURN_OF_CAPITAL',
          amount,
          date: taxYearEnd,
          currency: 'USD',
          notes: `K-1 Box ${box} (${label}) - Tax Year ${session.taxYear}`
        }
      });

      distributions.push(dist);
    }
  }

  // Update session to CONFIRMED and link KDocument
  await this.prismaService.k1ImportSession.update({
    where: { id: sessionId },
    data: {
      status: K1ImportStatus.CONFIRMED,
      kDocumentId: kDocument.id
    }
  });

  this.logger.log(
    `Session ${sessionId}: Confirmed. KDocument ${kDocument.id} created, ${distributions.length} distributions, ${allocations.length} member allocations`
  );

  return {
    importSession: {
      id: sessionId,
      status: 'CONFIRMED'
    },
    kDocument: {
      id: kDocument.id,
      partnershipId: kDocument.partnershipId,
      type: kDocument.type,
      taxYear: kDocument.taxYear,
      filingStatus: kDocument.filingStatus,
      data: kDocument.data
    },
    distributions,
    allocations: allocations.map((a) => ({
      entityId: a.entityId,
      entityName: a.entityName,
      ownershipPercent: a.ownershipPercent,
      allocatedValues: a.allocatedValues
    })),
    document: session.documentId
      ? { id: session.documentId, type: 'K1', name: session.fileName }
      : null,
    warnings: confirmWarnings
  };
}
/**
 * Check if a PDF is password-protected (FR-029).
 *
 * The PDF is parsed once; a failure whose name or message points at a
 * password or encryption problem is turned into a 400 HttpException. Any
 * other failure is deliberately ignored here, since it is not
 * password-related and the extraction tiers that run afterwards will report
 * it. The parser is always destroyed so its resources are released.
 *
 * @throws HttpException 400 when the PDF is password-protected/encrypted.
 */
private async checkPasswordProtected(buffer: Buffer): Promise<void> {
  try {
    const { PDFParse } = await import('pdf-parse');
    const parser = new PDFParse({ data: buffer });

    try {
      await parser.getText();
    } finally {
      // Release the parser whether or not parsing succeeded.
      await parser.destroy();
    }
  } catch (error) {
    // Match case-insensitively so differently capitalised messages
    // ("Password required", "Encrypted PDF") are recognised as well.
    const message = String(error?.message ?? '').toLowerCase();
    const isProtected =
      error?.name === 'PasswordException' ||
      message.includes('password') ||
      message.includes('encrypted');

    if (isProtected) {
      throw new HttpException(
        'Password-protected PDFs are not supported',
        StatusCodes.BAD_REQUEST
      );
    }
    // Other parse errors are not password-related, continue
  }
}
/**
 * Detect if a PDF contains multiple K-1 forms for different entities (Edge Case 5).
 *
 * Heuristic: the entity count is the larger of
 *  - half the number of "Schedule K-1" occurrences (the header is assumed
 *    to appear twice per form), and
 *  - the number of distinct EIN-shaped tokens (NN-NNNNNNN) minus one, when
 *    more than two distinct ones are found (one is assumed to be the
 *    partnership itself).
 *
 * Best effort: any parsing failure yields "single entity". The parser is
 * always destroyed so its resources are released.
 */
private async detectMultiEntityPdf(buffer: Buffer): Promise<{
  isMultiEntity: boolean;
  entityCount: number;
}> {
  try {
    const { PDFParse } = await import('pdf-parse');
    const parser = new PDFParse({ data: buffer });
    let text: string;

    try {
      const parsed = await parser.getText();
      text = parsed.text || '';
    } finally {
      await parser.destroy();
    }

    // Count "Schedule K-1" header occurrences
    const k1HeaderMatches = text.match(/Schedule\s+K-1/gi) || [];

    // Count unique EINs (XX-XXXXXXX format)
    const uniqueEins = new Set(text.match(/\d{2}-\d{7}/g) || []);

    // The second operand is at least 1, so entityCount is always >= 1.
    const entityCount = Math.max(
      Math.floor(k1HeaderMatches.length / 2), // K-1 header appears in header and footer
      uniqueEins.size > 2 ? uniqueEins.size - 1 : 1
    );

    return {
      isMultiEntity: entityCount > 1,
      entityCount
    };
  } catch {
    return { isMultiEntity: false, entityCount: 1 };
  }
}
/**
 * Generate edge case warnings based on extraction results and session context.
 * Edge cases: EIN mismatch, tax year mismatch, zero-extraction, multi-entity.
 *
 * EINs are compared on their digits only, so "12-3456789" and "123456789"
 * are treated as the same number. A field counts as carrying data only when
 * its numericValue is neither null/undefined nor zero.
 *
 * @returns Human-readable warning messages; empty when nothing was detected.
 */
private async generateWarnings(
  sessionId: string,
  extractionResult: K1ExtractionResult,
  partnershipId: string,
  buffer: Buffer
): Promise<string[]> {
  const warnings: string[] = [];

  // Edge Case 5: Multi-entity PDF detection
  const multiEntity = await this.detectMultiEntityPdf(buffer);

  if (multiEntity.isMultiEntity) {
    warnings.push(
      `This PDF appears to contain ${multiEntity.entityCount} K-1 forms for different entities. ` +
        'Only the first entity will be processed. Upload separate PDFs for each entity.'
    );
  }

  // Edge Case 3: Zero-extraction warning
  const hasNonZeroValue = extractionResult.fields.some(
    (f) => f.numericValue != null && f.numericValue !== 0
  );

  if (!hasNonZeroValue) {
    warnings.push(
      'All extracted values are zero or empty. The PDF may not be readable or may not contain K-1 data. ' +
        'Please verify the PDF quality and try again.'
    );
  }

  // The remaining checks need the session for context.
  const session = await this.prismaService.k1ImportSession.findUnique({
    where: { id: sessionId }
  });

  if (!session) {
    return warnings;
  }

  // Edge Case 4: EIN mismatch with existing partnership
  const partnership = await this.prismaService.partnership.findUnique({
    where: { id: partnershipId }
  });
  const partnershipEin = (partnership as any)?.ein;

  if (partnershipEin) {
    // Reduce an EIN to its digits; fall back to the trimmed text when it
    // contains none, so non-numeric values are still compared literally.
    const normalizeEin = (value: unknown): string => {
      const text = String(value).trim();
      return text.replace(/\D/g, '') || text;
    };

    const extractedEin = extractionResult.fields.find(
      (f) =>
        f.label?.toLowerCase().includes('ein') ||
        f.boxNumber?.toLowerCase() === 'ein'
    );

    if (
      extractedEin?.rawValue &&
      normalizeEin(extractedEin.rawValue) !== normalizeEin(partnershipEin)
    ) {
      warnings.push(
        `Extracted EIN (${extractedEin.rawValue}) does not match partnership EIN (${partnershipEin}). ` +
          'Verify you uploaded the correct K-1 for this partnership.'
      );
    }
  }

  // Edge Case 6: Tax year mismatch
  const extractedYear = extractionResult.fields.find(
    (f) =>
      f.label?.toLowerCase().includes('tax year') ||
      f.label?.toLowerCase().includes('calendar year') ||
      f.boxNumber?.toLowerCase() === 'taxyear'
  );

  if (extractedYear?.rawValue) {
    const parsedYear = parseInt(extractedYear.rawValue, 10);

    if (!isNaN(parsedYear) && parsedYear !== session.taxYear) {
      warnings.push(
        `Extracted tax year (${parsedYear}) does not match expected tax year (${session.taxYear}). ` +
          'You can override the tax year during verification if needed.'
      );
    }
  }

  return warnings;
}
}

17
apps/api/src/app/upload/upload.service.ts

@ -4,8 +4,9 @@ import { HttpException, Injectable } from '@nestjs/common';
import { DocumentType } from '@prisma/client';
import { StatusCodes, getReasonPhrase } from 'http-status-codes';
import { createReadStream, existsSync } from 'node:fs';
import { mkdir } from 'node:fs/promises';
import { mkdir, writeFile } from 'node:fs/promises';
import { join } from 'node:path';
import { v4 as uuidv4 } from 'uuid';
@Injectable()
export class UploadService {
@ -51,7 +52,19 @@ export class UploadService {
await mkdir(subDir, { recursive: true });
}
const relativePath = `${yearDir}/${monthDir}/${file.filename}`;
// Support both disk storage (file.filename set by multer) and memory storage (file.buffer)
let filename = file.filename;
if (!filename) {
const ext = (file.originalname || 'file').split('.').pop();
filename = `${uuidv4()}.${ext}`;
if (file.buffer) {
await writeFile(join(subDir, filename), file.buffer);
}
}
const relativePath = `/${yearDir}/${monthDir}/${filename}`;
return this.prismaService.document.create({
data: {

2
apps/api/src/environments/environment.ts

@ -2,6 +2,6 @@ import { DEFAULT_HOST } from '@ghostfolio/common/config';
export const environment = {
production: false,
rootUrl: `https://${DEFAULT_HOST}:4200`,
rootUrl: `http://${DEFAULT_HOST}:4200`,
version: 'dev'
};

2
apps/api/src/services/configuration/configuration.service.ts

@ -23,6 +23,8 @@ export class ConfigurationService {
this.environmentConfiguration = cleanEnv(process.env, {
ACCESS_TOKEN_SALT: str(),
API_KEY_ALPHA_VANTAGE: str({ default: '' }),
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT: str({ default: '' }),
AZURE_DOCUMENT_INTELLIGENCE_KEY: str({ default: '' }),
API_KEY_BETTER_UPTIME: str({ default: '' }),
API_KEY_COINGECKO_DEMO: str({ default: '' }),
API_KEY_COINGECKO_PRO: str({ default: '' }),

2
apps/api/src/services/interfaces/environment.interface.ts

@ -3,6 +3,8 @@ import { CleanedEnvAccessors } from 'envalid';
export interface Environment extends CleanedEnvAccessors {
ACCESS_TOKEN_SALT: string;
API_KEY_ALPHA_VANTAGE: string;
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT: string;
AZURE_DOCUMENT_INTELLIGENCE_KEY: string;
API_KEY_BETTER_UPTIME: string;
API_KEY_COINGECKO_DEMO: string;
API_KEY_COINGECKO_PRO: string;

5
apps/client/project.json

@ -215,10 +215,7 @@
"executor": "@nx/angular:dev-server",
"options": {
"buildTarget": "client:build",
"proxyConfig": "apps/client/proxy.conf.json",
"ssl": true,
"sslCert": "apps/client/localhost.cert",
"sslKey": "apps/client/localhost.pem"
"proxyConfig": "apps/client/proxy.conf.json"
},
"configurations": {
"development-ca": {

14
apps/client/src/app/app.routes.ts

@ -176,6 +176,13 @@ export const routes: Routes = [
(m) => m.routes
)
},
{
path: 'cell-mapping',
loadChildren: () =>
import('./pages/cell-mapping/cell-mapping-page.routes').then(
(m) => m.routes
)
},
{
path: 'k-documents',
loadChildren: () =>
@ -183,6 +190,13 @@ export const routes: Routes = [
(m) => m.routes
)
},
{
path: 'k1-import',
loadChildren: () =>
import('./pages/k1-import/k1-import-page.routes').then(
(m) => m.routes
)
},
{
path: 'reports',
loadChildren: () =>

42
apps/client/src/app/components/header/header.component.html

@ -110,6 +110,32 @@
>K-1 Documents</a
>
</li>
<li class="list-inline-item">
<a
class="d-none d-sm-block"
i18n
mat-flat-button
routerLink="/k1-import"
[ngClass]="{
'font-weight-bold': currentRoute === 'k1-import',
'text-decoration-underline': currentRoute === 'k1-import'
}"
>K-1 Import</a
>
</li>
<li class="list-inline-item">
<a
class="d-none d-sm-block"
i18n
mat-flat-button
routerLink="/cell-mapping"
[ngClass]="{
'font-weight-bold': currentRoute === 'cell-mapping',
'text-decoration-underline': currentRoute === 'cell-mapping'
}"
>Cell Mapping</a
>
</li>
<li class="list-inline-item">
<a
class="d-none d-sm-block"
@ -364,6 +390,22 @@
[ngClass]="{ 'font-weight-bold': currentRoute === 'k-documents' }"
>K-1 Documents</a
>
<a
class="d-flex d-sm-none"
i18n
mat-menu-item
routerLink="/k1-import"
[ngClass]="{ 'font-weight-bold': currentRoute === 'k1-import' }"
>K-1 Import</a
>
<a
class="d-flex d-sm-none"
i18n
mat-menu-item
routerLink="/cell-mapping"
[ngClass]="{ 'font-weight-bold': currentRoute === 'cell-mapping' }"
>Cell Mapping</a
>
<a
class="d-flex d-sm-none"
i18n

372
apps/client/src/app/pages/cell-mapping/cell-mapping-page.component.ts

@ -0,0 +1,372 @@
import { K1ImportDataService } from '@ghostfolio/client/services/k1-import-data.service';
import { FamilyOfficeDataService } from '@ghostfolio/client/services/family-office-data.service';
import { CommonModule } from '@angular/common';
import {
ChangeDetectionStrategy,
ChangeDetectorRef,
Component,
DestroyRef,
OnInit
} from '@angular/core';
import { takeUntilDestroyed } from '@angular/core/rxjs-interop';
import { FormsModule } from '@angular/forms';
import { MatButtonModule } from '@angular/material/button';
import { MatCheckboxModule } from '@angular/material/checkbox';
import { MatFormFieldModule } from '@angular/material/form-field';
import { MatIconModule } from '@angular/material/icon';
import { MatInputModule } from '@angular/material/input';
import { MatSelectModule } from '@angular/material/select';
import { MatTableModule } from '@angular/material/table';
import { MatTooltipModule } from '@angular/material/tooltip';
/**
 * One row of the cell-mapping table together with its inline-edit state.
 */
interface EditableMapping {
  // Persisted mapping values
  boxNumber: string;
  cellType: string;
  description: string;
  label: string;

  // Row flags
  isCustom: boolean;
  isIgnored: boolean;

  // Inline-edit state: while isEditing is set, the edit* fields hold the
  // working copies of label / description / cellType.
  isEditing: boolean;
  editCellType: string;
  editDescription: string;
  editLabel: string;
}
/**
 * One aggregation rule together with its inline-edit state.
 */
interface EditableRule {
  // Rule definition
  name: string;
  operation: string;
  sourceCells: string[];

  // Inline-edit state. editSourceCells is a single string while sourceCells
  // is a list - presumably a delimited form of the list; verify against the
  // template and save handler.
  isEditing: boolean;
  editName: string;
  editSourceCells: string;
}
@Component({
changeDetection: ChangeDetectionStrategy.OnPush,
host: { class: 'page' },
imports: [
CommonModule,
FormsModule,
MatButtonModule,
MatCheckboxModule,
MatFormFieldModule,
MatIconModule,
MatInputModule,
MatSelectModule,
MatTableModule,
MatTooltipModule
],
selector: 'gf-cell-mapping-page',
styleUrls: ['./cell-mapping-page.scss'],
templateUrl: './cell-mapping-page.html'
})
export class CellMappingPageComponent implements OnInit {
public aggregationRules: EditableRule[] = [];
public error: string | null = null;
public isSaving = false;
public mappings: EditableMapping[] = [];
public partnerships: Array<{ id: string; name: string }> = [];
public selectedPartnershipId = '';
public successMessage: string | null = null;
// New custom cell form
public newBoxNumber = '';
public newCellType = 'number';
public newLabel = '';
// New rule form
public newRuleName = '';
public newRuleSourceCells = '';
public cellTypeOptions = [
{ value: 'number', label: 'Number ($)' },
{ value: 'string', label: 'String' },
{ value: 'percentage', label: 'Percentage (%)' },
{ value: 'boolean', label: 'Boolean' }
];
public displayedColumns = ['boxNumber', 'label', 'description', 'cellType', 'isCustom', 'isIgnored', 'actions'];
public constructor(
private readonly changeDetectorRef: ChangeDetectorRef,
private readonly destroyRef: DestroyRef,
private readonly familyOfficeDataService: FamilyOfficeDataService,
private readonly k1ImportDataService: K1ImportDataService
) {}
/** Loads the partnership list that populates the selector when the page starts. */
public ngOnInit(): void {
  this.fetchPartnerships();
}
/**
 * Handles a new selection in the partnership dropdown by loading that
 * partnership's cell mappings and aggregation rules.
 *
 * Rows and feedback banners that belong to the previously selected
 * partnership are cleared first. Otherwise they would stay visible — and
 * could be saved under the newly selected partnership id — while the new
 * requests are in flight, or indefinitely if those requests fail.
 */
public onPartnershipChange(): void {
  if (!this.selectedPartnershipId) {
    return;
  }

  this.mappings = [];
  this.aggregationRules = [];
  this.error = null;
  this.successMessage = null;
  this.changeDetectorRef.markForCheck();

  this.loadMappings();
  this.loadAggregationRules();
}
// ── Cell Mapping Methods ─────────────────────────────────────────
/**
 * Switches a row into inline-edit mode, seeding the draft fields with the
 * row's current label, description and cell type.
 */
public startEditMapping(mapping: EditableMapping): void {
  Object.assign(mapping, {
    isEditing: true,
    editLabel: mapping.label,
    editDescription: mapping.description,
    editCellType: mapping.cellType
  });

  this.changeDetectorRef.markForCheck();
}
/**
 * Commits the row's draft values to its displayed values and leaves edit
 * mode. This only changes local state; nothing is sent to the server here.
 */
public saveEditMapping(mapping: EditableMapping): void {
  const { editLabel, editDescription, editCellType } = mapping;

  Object.assign(mapping, {
    label: editLabel,
    description: editDescription,
    cellType: editCellType,
    isEditing: false
  });

  this.changeDetectorRef.markForCheck();
}
/** Leaves edit mode without copying the draft values back onto the row. */
public cancelEditMapping(mapping: EditableMapping): void {
  mapping.isEditing = false;

  this.changeDetectorRef.markForCheck();
}
/**
 * Asks the server to toggle the "ignored" flag of one box for the selected
 * partnership. The row is updated from the server's response rather than
 * optimistically; on failure an error banner is shown instead.
 */
public toggleIgnored(mapping: EditableMapping): void {
  const partnershipId = this.selectedPartnershipId;

  if (!partnershipId) {
    return;
  }

  const toggle$ = this.k1ImportDataService.toggleFieldIgnored({
    partnershipId,
    boxNumber: mapping.boxNumber
  });

  toggle$.pipe(takeUntilDestroyed(this.destroyRef)).subscribe({
    next: (result: any) => {
      mapping.isIgnored = result.isIgnored;
      this.changeDetectorRef.markForCheck();
    },
    error: (err) => {
      const serverMessage = err?.error?.message;

      this.error = serverMessage || 'Failed to toggle ignored state.';
      this.changeDetectorRef.markForCheck();
    }
  });
}
/**
 * Appends a custom cell built from the "add custom cell" form to the local
 * mappings list and resets the form. Nothing is persisted until
 * `saveMappings()` is called.
 *
 * - Box number and label are trimmed; blank values are ignored.
 * - A box number that already exists in the list is rejected with an error
 *   banner, because two rows with the same key would be ambiguous.
 * - The list is replaced with a new array instead of being mutated: the
 *   Material table bound to `mappings` only re-renders its rows when the
 *   data array reference changes (or `renderRows()` is called).
 */
public addCustomCell(): void {
  const boxNumber = this.newBoxNumber.trim();
  const label = this.newLabel.trim();

  if (!boxNumber || !label) {
    return;
  }

  if (this.mappings.some((m) => m.boxNumber === boxNumber)) {
    this.error = `A mapping for box ${boxNumber} already exists.`;
    this.changeDetectorRef.markForCheck();
    return;
  }

  const customCell: EditableMapping = {
    boxNumber,
    label,
    description: '',
    cellType: this.newCellType,
    isCustom: true,
    isIgnored: false,
    isEditing: false,
    editLabel: '',
    editDescription: '',
    editCellType: this.newCellType
  };

  this.mappings = [...this.mappings, customCell];
  this.error = null;

  this.newBoxNumber = '';
  this.newLabel = '';
  this.newCellType = 'number';
  this.changeDetectorRef.markForCheck();
}
/**
 * Removes the mapping at `index` from the local list (not persisted until
 * the mappings are saved).
 *
 * A new array is assigned rather than splicing in place, because the
 * Material table bound to `mappings` does not notice in-place mutations of
 * its data array and would keep showing the removed row.
 *
 * @param index Position of the row to remove; out-of-range values are a no-op.
 */
public removeMapping(index: number): void {
  this.mappings = this.mappings.filter((_, i) => i !== index);
  this.changeDetectorRef.markForCheck();
}
/**
 * Persists the current mappings list for the selected partnership.
 * Only the persistent fields of each row are sent; edit drafts and the
 * "ignored" flag are left out. Shows a success or error banner when done.
 */
public saveMappings(): void {
  const partnershipId = this.selectedPartnershipId;

  if (!partnershipId) {
    return;
  }

  this.isSaving = true;
  this.error = null;
  this.successMessage = null;
  this.changeDetectorRef.markForCheck();

  const mappings = this.mappings.map(
    ({ boxNumber, label, description, cellType, isCustom }) => ({
      boxNumber,
      label,
      description,
      cellType,
      isCustom
    })
  );

  this.k1ImportDataService
    .updateCellMappings({ partnershipId, mappings })
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: () => {
        this.isSaving = false;
        this.successMessage = 'Cell mappings saved successfully.';
        this.changeDetectorRef.markForCheck();
      },
      error: (err) => {
        this.isSaving = false;
        this.error =
          err?.error?.message || err?.message || 'Failed to save mappings.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Resets the selected partnership's cell mappings on the server and then
 * reloads the list.
 *
 * Any previous error/success banner is cleared before the request, in line
 * with the save methods, so a stale error cannot be shown next to the
 * "reset" success message.
 */
public resetToDefaults(): void {
  if (!this.selectedPartnershipId) {
    return;
  }

  this.error = null;
  this.successMessage = null;
  this.changeDetectorRef.markForCheck();

  this.k1ImportDataService
    .resetCellMappings(this.selectedPartnershipId)
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: () => {
        this.successMessage = 'Cell mappings reset to IRS defaults.';
        this.changeDetectorRef.markForCheck();
        this.loadMappings();
      },
      error: (err) => {
        this.error =
          err?.error?.message || err?.message || 'Failed to reset mappings.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
// ── Aggregation Rule Methods ─────────────────────────────────────
/**
 * Adds a SUM rule built from the "add rule" form to the local rules list and
 * resets the form. Nothing is persisted until `saveAggregationRules()`.
 *
 * The comma-separated source cells are trimmed, empty entries (e.g. from a
 * trailing comma) are dropped and duplicates are collapsed, since the
 * template tracks the rendered chips by cell value. A rule whose name is
 * already in use is rejected with an error banner, because the template
 * tracks rule cards by name.
 */
public addAggregationRule(): void {
  const name = this.newRuleName.trim();
  const sourceCells = [
    ...new Set(
      this.newRuleSourceCells
        .split(',')
        .map((cell) => cell.trim())
        .filter((cell) => cell.length > 0)
    )
  ];

  if (!name || sourceCells.length === 0) {
    return;
  }

  if (this.aggregationRules.some((rule) => rule.name === name)) {
    this.error = `An aggregation rule named "${name}" already exists.`;
    this.changeDetectorRef.markForCheck();
    return;
  }

  this.aggregationRules = [
    ...this.aggregationRules,
    {
      name,
      operation: 'SUM',
      sourceCells,
      isEditing: false,
      editName: '',
      editSourceCells: ''
    }
  ];
  this.error = null;

  this.newRuleName = '';
  this.newRuleSourceCells = '';
  this.changeDetectorRef.markForCheck();
}
/**
 * Drops the rule at the given position from the local list; the change is
 * only persisted once the rules are saved.
 */
public removeAggregationRule(index: number): void {
  const deleteCount = 1;

  this.aggregationRules.splice(index, deleteCount);
  this.changeDetectorRef.markForCheck();
}
/**
 * Persists the current aggregation rules for the selected partnership,
 * sending only each rule's name, operation and source cells, and reports the
 * outcome through the success/error banners.
 */
public saveAggregationRules(): void {
  const partnershipId = this.selectedPartnershipId;

  if (!partnershipId) {
    return;
  }

  this.isSaving = true;
  this.error = null;
  this.successMessage = null;
  this.changeDetectorRef.markForCheck();

  const rules = this.aggregationRules.map(
    ({ name, operation, sourceCells }) => ({ name, operation, sourceCells })
  );

  this.k1ImportDataService
    .updateAggregationRules({ partnershipId, rules })
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: () => {
        this.isSaving = false;
        this.successMessage = 'Aggregation rules saved successfully.';
        this.changeDetectorRef.markForCheck();
      },
      error: (err) => {
        this.isSaving = false;
        this.error =
          err?.error?.message || err?.message || 'Failed to save rules.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
// ── Data Loading ─────────────────────────────────────────────────
/**
 * Loads the partnerships offered in the selector, keeping only the id and
 * name of each.
 *
 * A failed request is reported through the error banner, like the other
 * loaders on this page; without a handler the error would go unhandled and
 * the user would be left with an empty dropdown and no explanation.
 */
private fetchPartnerships(): void {
  this.familyOfficeDataService
    .fetchPartnerships()
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (partnerships) => {
        this.partnerships = partnerships.map((p) => ({
          id: p.id,
          name: p.name
        }));
        this.changeDetectorRef.markForCheck();
      },
      error: (err) => {
        this.error =
          err?.error?.message || 'Failed to load partnerships.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Fetches the cell mappings of the currently selected partnership and turns
 * them into editable table rows (missing description/type/ignored values
 * fall back to '', 'number' and false).
 *
 * The partnership id is captured when the request starts; a response or
 * error that arrives after the user has switched to another partnership is
 * discarded so it cannot overwrite the rows of the current selection.
 */
private loadMappings(): void {
  const partnershipId = this.selectedPartnershipId;

  this.k1ImportDataService
    .fetchCellMappings(partnershipId)
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (mappings: any[]) => {
        if (partnershipId !== this.selectedPartnershipId) {
          return;
        }

        this.mappings = mappings.map((m) => ({
          boxNumber: m.boxNumber,
          label: m.label,
          description: m.description || '',
          cellType: m.cellType || 'number',
          isCustom: m.isCustom,
          isIgnored: m.isIgnored ?? false,
          isEditing: false,
          editLabel: '',
          editDescription: '',
          editCellType: m.cellType || 'number'
        }));
        this.changeDetectorRef.markForCheck();
      },
      error: (err) => {
        if (partnershipId !== this.selectedPartnershipId) {
          return;
        }

        this.error =
          err?.error?.message || 'Failed to load cell mappings.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Fetches the aggregation rules of the currently selected partnership and
 * converts them into the page's rule view models (a missing `sourceCells`
 * becomes an empty list).
 *
 * The partnership id is captured when the request starts; a response or
 * error that arrives after the selection has changed is discarded so it
 * cannot replace the rules of the partnership now on screen.
 */
private loadAggregationRules(): void {
  const partnershipId = this.selectedPartnershipId;

  this.k1ImportDataService
    .fetchAggregationRules(partnershipId)
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (rules: any[]) => {
        if (partnershipId !== this.selectedPartnershipId) {
          return;
        }

        this.aggregationRules = rules.map((r) => ({
          name: r.name,
          operation: r.operation,
          sourceCells: (r.sourceCells as string[]) || [],
          isEditing: false,
          editName: '',
          editSourceCells: ''
        }));
        this.changeDetectorRef.markForCheck();
      },
      error: (err) => {
        if (partnershipId !== this.selectedPartnershipId) {
          return;
        }

        this.error =
          err?.error?.message || 'Failed to load aggregation rules.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
}

207
apps/client/src/app/pages/cell-mapping/cell-mapping-page.html

@ -0,0 +1,207 @@
<div class="container">
<h1>Cell Mapping Configuration</h1>
@if (error) {
<div class="alert alert-error">{{ error }}</div>
}
@if (successMessage) {
<div class="alert alert-success">{{ successMessage }}</div>
}
<!-- Partnership Selector -->
<section class="partnership-selector">
<mat-form-field appearance="outline">
<mat-label>Partnership</mat-label>
<mat-select [(ngModel)]="selectedPartnershipId" (selectionChange)="onPartnershipChange()">
@for (p of partnerships; track p.id) {
<mat-option [value]="p.id">{{ p.name }}</mat-option>
}
</mat-select>
</mat-form-field>
</section>
@if (selectedPartnershipId) {
<!-- Cell Mappings -->
<section class="cell-mappings">
<h2>Cell Mappings</h2>
<table mat-table [dataSource]="mappings" class="mappings-table">
<!-- Box Number -->
<ng-container matColumnDef="boxNumber">
<th mat-header-cell *matHeaderCellDef>Box #</th>
<td mat-cell *matCellDef="let row">{{ row.boxNumber }}</td>
</ng-container>
<!-- Label: plain text, or an inline input while the row is being edited -->
<ng-container matColumnDef="label">
  <th mat-header-cell *matHeaderCellDef>Label</th>
  <td mat-cell *matCellDef="let row">
    @if (row.isEditing) {
      <!-- The bare input has no visible label, so give it an accessible name. -->
      <input
        class="cell-input"
        [attr.aria-label]="'Label for box ' + row.boxNumber"
        [(ngModel)]="row.editLabel" />
    } @else {
      {{ row.label }}
    }
  </td>
</ng-container>
<!-- Description: plain text, or an inline input while the row is being edited -->
<ng-container matColumnDef="description">
  <th mat-header-cell *matHeaderCellDef>Description</th>
  <td mat-cell *matCellDef="let row">
    @if (row.isEditing) {
      <!-- The bare input has no visible label, so give it an accessible name. -->
      <input
        class="cell-input"
        [attr.aria-label]="'Description for box ' + row.boxNumber"
        [(ngModel)]="row.editDescription" />
    } @else {
      {{ row.description }}
    }
  </td>
</ng-container>
<!-- Is Custom -->
<ng-container matColumnDef="isCustom">
<th mat-header-cell *matHeaderCellDef>Custom</th>
<td mat-cell *matCellDef="let row">
@if (row.isCustom) {
<mat-icon class="custom-badge" matTooltip="Partnership-specific override">star</mat-icon>
}
</td>
</ng-container>
<!-- Cell Type: coloured badge, or an inline select while the row is being edited -->
<ng-container matColumnDef="cellType">
  <th mat-header-cell *matHeaderCellDef>Type</th>
  <td mat-cell *matCellDef="let row">
    @if (row.isEditing) {
      <!-- Used without a mat-form-field/mat-label, so the select needs its own accessible name. -->
      <mat-select
        class="type-select"
        [aria-label]="'Type for box ' + row.boxNumber"
        [(ngModel)]="row.editCellType">
        @for (opt of cellTypeOptions; track opt.value) {
          <mat-option [value]="opt.value">{{ opt.label }}</mat-option>
        }
      </mat-select>
    } @else {
      <span class="type-badge type-{{ row.cellType }}">{{ row.cellType }}</span>
    }
  </td>
</ng-container>
<!-- Is Ignored: toggled through the server; the row is updated from the response -->
<ng-container matColumnDef="isIgnored">
  <th mat-header-cell *matHeaderCellDef>Ignored</th>
  <td mat-cell *matCellDef="let row">
    <!-- The checkbox has no text content, so it needs an explicit accessible name. -->
    <mat-checkbox
      [aria-label]="'Ignore box ' + row.boxNumber"
      [checked]="row.isIgnored"
      (change)="toggleIgnored(row)"
      matTooltip="Ignored fields are excluded from scan results">
    </mat-checkbox>
  </td>
</ng-container>
<!-- Actions: save/cancel while editing; otherwise edit, plus remove for custom rows -->
<ng-container matColumnDef="actions">
  <th mat-header-cell *matHeaderCellDef>Actions</th>
  <td mat-cell *matCellDef="let row; let i = index">
    <!-- Icon-only buttons: the tooltip is only a description, so each gets an aria-label as its name. -->
    @if (row.isEditing) {
      <button mat-icon-button aria-label="Save" (click)="saveEditMapping(row)" matTooltip="Save">
        <mat-icon>check</mat-icon>
      </button>
      <button mat-icon-button aria-label="Cancel" (click)="cancelEditMapping(row)" matTooltip="Cancel">
        <mat-icon>close</mat-icon>
      </button>
    } @else {
      <button mat-icon-button aria-label="Edit" (click)="startEditMapping(row)" matTooltip="Edit">
        <mat-icon>edit</mat-icon>
      </button>
      @if (row.isCustom) {
        <button mat-icon-button aria-label="Remove" (click)="removeMapping(i)" matTooltip="Remove">
          <mat-icon>delete</mat-icon>
        </button>
      }
    }
  </td>
</ng-container>
<tr mat-header-row *matHeaderRowDef="displayedColumns"></tr>
<tr mat-row *matRowDef="let row; columns: displayedColumns;"></tr>
</table>
<!-- Add Custom Cell -->
<div class="add-row">
<mat-form-field appearance="outline">
<mat-label>Box #</mat-label>
<input matInput [(ngModel)]="newBoxNumber" placeholder="e.g. 20c" />
</mat-form-field>
<mat-form-field appearance="outline">
<mat-label>Label</mat-label>
<input matInput [(ngModel)]="newLabel" placeholder="e.g. Other deductions" />
</mat-form-field>
<mat-form-field appearance="outline">
<mat-label>Type</mat-label>
<mat-select [(ngModel)]="newCellType">
@for (opt of cellTypeOptions; track opt.value) {
<mat-option [value]="opt.value">{{ opt.label }}</mat-option>
}
</mat-select>
</mat-form-field>
<button mat-stroked-button (click)="addCustomCell()" [disabled]="!newBoxNumber || !newLabel">
<mat-icon>add</mat-icon> Add Custom Cell
</button>
</div>
<!-- Mapping Actions -->
<div class="mapping-actions">
<button mat-flat-button color="primary" (click)="saveMappings()" [disabled]="isSaving">
Save Mappings
</button>
<button mat-stroked-button color="warn" (click)="resetToDefaults()">
Reset to IRS Defaults
</button>
</div>
</section>
<!-- Aggregation Rules -->
<section class="aggregation-rules">
<h2>Aggregation Rules</h2>
@if (aggregationRules.length === 0) {
<p class="empty-state">No aggregation rules configured.</p>
}
<!-- One card per aggregation rule: name, operation, remove button and source-cell chips -->
@for (rule of aggregationRules; track rule.name; let i = $index) {
  <div class="rule-card">
    <div class="rule-header">
      <strong>{{ rule.name }}</strong>
      <span class="rule-operation">{{ rule.operation }}</span>
      <!-- Icon-only button: aria-label supplies the accessible name. -->
      <button
        mat-icon-button
        aria-label="Remove rule"
        (click)="removeAggregationRule(i)"
        matTooltip="Remove rule">
        <mat-icon>delete</mat-icon>
      </button>
    </div>
    <div class="rule-source-cells">
      Source cells:
      @for (cell of rule.sourceCells; track cell) {
        <span class="cell-chip">{{ cell }}</span>
      }
    </div>
  </div>
}
<!-- Add Aggregation Rule -->
<div class="add-rule-row">
<mat-form-field appearance="outline">
<mat-label>Rule Name</mat-label>
<input matInput [(ngModel)]="newRuleName" placeholder="e.g. Total Income" />
</mat-form-field>
<mat-form-field appearance="outline">
<mat-label>Source Cells (comma-separated)</mat-label>
<input matInput [(ngModel)]="newRuleSourceCells" placeholder="e.g. 1, 2, 3, 4a" />
</mat-form-field>
<button mat-stroked-button (click)="addAggregationRule()" [disabled]="!newRuleName || !newRuleSourceCells">
<mat-icon>add</mat-icon> Add Rule
</button>
</div>
<div class="rule-actions">
<button mat-flat-button color="primary" (click)="saveAggregationRules()" [disabled]="isSaving">
Save Rules
</button>
</div>
</section>
}
</div>

15
apps/client/src/app/pages/cell-mapping/cell-mapping-page.routes.ts

@ -0,0 +1,15 @@
import { AuthGuard } from '@ghostfolio/client/core/auth.guard';
import { Routes } from '@angular/router';
/**
 * Routes of the cell-mapping feature: a single route at the feature's root
 * path, protected by AuthGuard, whose component is loaded lazily.
 */
export const routes: Routes = [
{
canActivate: [AuthGuard],
// Dynamic import so the page component is only fetched when the route is activated.
loadComponent: () =>
import('./cell-mapping-page.component').then(
(c) => c.CellMappingPageComponent
),
path: '',
title: 'Cell Mapping'
}
];

258
apps/client/src/app/pages/cell-mapping/cell-mapping-page.scss

@ -0,0 +1,258 @@
:host {
display: block;
}
.container {
max-width: 1400px;
margin: 0 auto;
padding: 1.5rem;
}
h1 {
margin-bottom: 1.5rem;
}
h2 {
margin-bottom: 1rem;
font-size: 1.25rem;
}
// Alerts
.alert {
padding: 0.75rem 1rem;
border-radius: 4px;
margin-bottom: 1rem;
font-size: 0.875rem;
}
.alert-error {
background-color: #fdecea;
color: #b71c1c;
}
.alert-success {
background-color: #e8f5e9;
color: #2e7d32;
}
// Partnership Selector
.partnership-selector {
margin-bottom: 1.5rem;
mat-form-field {
width: 100%;
max-width: 400px;
}
}
// Cell Mappings
.cell-mappings {
margin-bottom: 2rem;
}
.mappings-table {
width: 100%;
margin-bottom: 1rem;
// Let browser auto-size columns based on content
th.mat-mdc-header-cell,
td.mat-mdc-cell {
padding: 8px 12px;
vertical-align: middle;
}
.mat-column-boxNumber {
white-space: nowrap;
font-family: 'Roboto Mono', monospace;
font-size: 0.8125rem;
color: rgba(0, 0, 0, 0.72);
width: 1%; // shrink-to-fit trick for auto layout
}
.mat-column-label {
white-space: nowrap;
}
.mat-column-description {
color: rgba(0, 0, 0, 0.6);
font-size: 0.8125rem;
// Allow wrapping for long descriptions
word-break: break-word;
}
.mat-column-cellType {
width: 1%;
white-space: nowrap;
}
.mat-column-isCustom {
width: 1%;
white-space: nowrap;
text-align: center;
}
.mat-column-isIgnored {
width: 1%;
white-space: nowrap;
text-align: center;
}
.mat-column-actions {
width: 1%;
white-space: nowrap;
}
}
// Lightweight inline cell inputs (no mat-form-field wrapper)
.cell-input {
width: 100%;
min-width: 160px;
box-sizing: border-box;
padding: 6px 8px;
font-size: 0.8125rem;
font-family: inherit;
border: 1px solid rgba(0, 0, 0, 0.24);
border-radius: 4px;
background: transparent;
outline: none;
transition: border-color 0.15s ease;
&:focus {
border-color: var(--primary-color, #1976d2);
box-shadow: 0 0 0 1px var(--primary-color, #1976d2);
}
}
.custom-badge {
color: #f9a825;
font-size: 20px;
}
// Type badge styling
.type-badge {
display: inline-block;
font-size: 0.75rem;
font-weight: 500;
padding: 2px 8px;
border-radius: 10px;
text-transform: capitalize;
white-space: nowrap;
}
.type-number {
background-color: #e3f2fd;
color: #1565c0;
}
.type-string {
background-color: #f3e5f5;
color: #7b1fa2;
}
.type-percentage {
background-color: #e8f5e9;
color: #2e7d32;
}
.type-boolean {
background-color: #fff3e0;
color: #e65100;
}
// Inline type selector (no mat-form-field wrapper)
.type-select {
min-width: 110px;
}
.add-row {
display: flex;
align-items: center;
gap: 1rem;
margin-bottom: 1rem;
flex-wrap: wrap;
mat-form-field {
flex: 0 0 auto;
width: 160px;
}
}
.mapping-actions {
display: flex;
gap: 1rem;
margin-top: 0.5rem;
}
// Aggregation Rules
.aggregation-rules {
margin-bottom: 2rem;
}
.empty-state {
color: rgba(0, 0, 0, 0.54);
font-style: italic;
margin-bottom: 1rem;
}
.rule-card {
border: 1px solid rgba(0, 0, 0, 0.12);
border-radius: 4px;
padding: 0.75rem 1rem;
margin-bottom: 0.75rem;
}
.rule-header {
display: flex;
align-items: center;
gap: 0.75rem;
strong {
flex: 1;
}
.rule-operation {
font-family: monospace;
font-size: 0.8rem;
background-color: #e8eaf6;
color: #283593;
padding: 2px 8px;
border-radius: 4px;
}
}
.rule-source-cells {
margin-top: 0.5rem;
font-size: 0.875rem;
color: rgba(0, 0, 0, 0.7);
}
.cell-chip {
display: inline-block;
font-family: monospace;
font-size: 0.8rem;
background-color: #f5f5f5;
border: 1px solid rgba(0, 0, 0, 0.12);
border-radius: 12px;
padding: 2px 8px;
margin-left: 4px;
}
.add-rule-row {
display: flex;
align-items: center;
gap: 1rem;
margin-bottom: 1rem;
flex-wrap: wrap;
mat-form-field {
flex: 1 1 200px;
}
}
.rule-actions {
margin-top: 0.5rem;
}

134
apps/client/src/app/pages/k-documents/k-document-detail/k-document-detail.component.ts

@ -0,0 +1,134 @@
import { FamilyOfficeDataService } from '@ghostfolio/client/services/family-office-data.service';
import { K1ImportDataService } from '@ghostfolio/client/services/k1-import-data.service';
import { K1AggregationResult } from '@ghostfolio/common/interfaces/k1-import.interface';
import { CommonModule } from '@angular/common';
import {
ChangeDetectionStrategy,
ChangeDetectorRef,
Component,
DestroyRef,
OnInit
} from '@angular/core';
import { takeUntilDestroyed } from '@angular/core/rxjs-interop';
import { MatButtonModule } from '@angular/material/button';
import { MatCardModule } from '@angular/material/card';
import { MatChipsModule } from '@angular/material/chips';
import { MatIconModule } from '@angular/material/icon';
import { MatTableModule } from '@angular/material/table';
import { ActivatedRoute, Router, RouterModule } from '@angular/router';
@Component({
changeDetection: ChangeDetectionStrategy.OnPush,
host: { class: 'page' },
imports: [
CommonModule,
MatButtonModule,
MatCardModule,
MatChipsModule,
MatIconModule,
MatTableModule,
RouterModule
],
selector: 'gf-k-document-detail',
styleUrls: ['./k-document-detail.scss'],
templateUrl: './k-document-detail.html'
})
export class KDocumentDetailComponent implements OnInit {
public aggregations: K1AggregationResult[] = [];
public boxColumns = ['boxNumber', 'value'];
public boxData: Array<{ boxNumber: string; value: number | string | null }> = [];
public error: string | null = null;
public kDocument: any = null;
public kDocumentId: string;
/** Box numbers that represent percentage values (Section J) */
private static readonly PERCENTAGE_BOXES = new Set([
'J_PROFIT_BEGIN', 'J_PROFIT_END',
'J_LOSS_BEGIN', 'J_LOSS_END',
'J_CAPITAL_BEGIN', 'J_CAPITAL_END'
]);
/** Tells whether the given box is one of the Section J boxes listed in PERCENTAGE_BOXES. */
public isPercentage(boxNumber: string): boolean {
  const percentageBoxes = KDocumentDetailComponent.PERCENTAGE_BOXES;

  return percentageBoxes.has(boxNumber);
}
/**
 * Type guard used by the template to decide whether a box value is rendered
 * as currency. Accepts `unknown` instead of `any` so callers do not lose
 * type checking; the runtime check is unchanged.
 *
 * @returns true when `value` is a JavaScript number.
 */
public isNumeric(value: unknown): value is number {
  return typeof value === 'number';
}
public constructor(
private readonly activatedRoute: ActivatedRoute,
private readonly changeDetectorRef: ChangeDetectorRef,
private readonly destroyRef: DestroyRef,
private readonly familyOfficeDataService: FamilyOfficeDataService,
private readonly k1ImportDataService: K1ImportDataService,
private readonly router: Router
) {}
/**
 * Reads the document id from the route and loads the document together with
 * its aggregations.
 *
 * If the id is missing, an error is shown instead: the template renders
 * "Loading..." whenever there is neither a document nor an error, so doing
 * nothing would leave the page in a permanent loading state.
 */
public ngOnInit(): void {
  this.kDocumentId = this.activatedRoute.snapshot.paramMap.get('id') || '';

  if (!this.kDocumentId) {
    this.error = 'K-Document not found.';
    return;
  }

  this.loadKDocument();
  this.loadAggregations();
}
/** Returns to the K-documents list page. */
public goBack(): void {
  const listRoute = ['/k-documents'];

  this.router.navigate(listRoute);
}
/**
 * Fetches the K-documents, picks the one matching `kDocumentId` and flattens
 * its `data` record into table rows sorted by box number.
 *
 * When no document with that id exists, an error is shown. The template
 * renders "Loading..." whenever there is neither a document nor an error, so
 * an unknown id would otherwise look like a load that never finishes.
 */
private loadKDocument(): void {
  this.familyOfficeDataService
    .fetchKDocuments()
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (docs) => {
        this.kDocument = docs.find((d) => d.id === this.kDocumentId) || null;

        if (!this.kDocument) {
          this.boxData = [];
          this.error = 'K-Document not found.';
        } else if (this.kDocument.data) {
          const data = this.kDocument.data as Record<string, any>;

          this.boxData = Object.entries(data)
            .map(([boxNumber, value]) => ({
              boxNumber,
              // Normalise undefined to null so the template's empty-value check matches.
              value: value ?? null
            }))
            .sort((a, b) => this.compareBoxNumbers(a.boxNumber, b.boxNumber));
        }

        this.changeDetectorRef.markForCheck();
      },
      error: () => {
        this.error = 'Failed to load K-Document.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Requests the computed aggregations for this document. A failure is treated
 * as "no aggregations" rather than as a page error.
 */
private loadAggregations(): void {
  const aggregations$ = this.k1ImportDataService.computeAggregations({
    kDocumentId: this.kDocumentId
  });

  aggregations$.pipe(takeUntilDestroyed(this.destroyRef)).subscribe({
    next: (results) => {
      this.aggregations = results;
      this.changeDetectorRef.markForCheck();
    },
    error: () => {
      // Aggregations may not be configured yet
      this.aggregations = [];
      this.changeDetectorRef.markForCheck();
    }
  });
}
/**
 * Sort comparator for box identifiers: orders by the number formed from all
 * digits in the identifier (0 when it has none), and falls back to a
 * locale-aware string comparison when those numbers are equal.
 */
private compareBoxNumbers(a: string, b: string): number {
  const digitsOf = (box: string): number =>
    parseInt(box.replace(/[^0-9]/g, ''), 10) || 0;
  const [left, right] = [a, b].map(digitsOf);

  return left === right ? a.localeCompare(b) : left - right;
}
}

106
apps/client/src/app/pages/k-documents/k-document-detail/k-document-detail.html

@ -0,0 +1,106 @@
<div class="container">
<div class="d-flex align-items-center mb-3">
  <!-- Icon-only button: aria-label supplies the accessible name. -->
  <button mat-icon-button aria-label="Back to K-Documents" (click)="goBack()">
    <mat-icon>arrow_back</mat-icon>
  </button>
  <h1 class="h3 mb-0 ms-2">K-Document Detail</h1>
</div>
@if (error) {
<div class="alert alert-danger">{{ error }}</div>
}
@if (kDocument) {
<!-- Summary Card -->
<mat-card class="mb-4">
<mat-card-header>
<mat-card-title>{{ kDocument.partnershipName || kDocument.partnershipId }}</mat-card-title>
<mat-card-subtitle>
{{ kDocument.type }} — Tax Year {{ kDocument.taxYear }}
</mat-card-subtitle>
</mat-card-header>
<mat-card-content>
<div class="detail-row">
<span class="label">Filing Status:</span>
<mat-chip-set>
<mat-chip [class.chip-draft]="kDocument.filingStatus === 'DRAFT'"
[class.chip-estimated]="kDocument.filingStatus === 'ESTIMATED'"
[class.chip-final]="kDocument.filingStatus === 'FINAL'">
{{ kDocument.filingStatus }}
</mat-chip>
</mat-chip-set>
</div>
<div class="detail-row">
<span class="label">Created:</span>
<span>{{ kDocument.createdAt | date:'medium' }}</span>
</div>
<div class="detail-row">
<span class="label">Updated:</span>
<span>{{ kDocument.updatedAt | date:'medium' }}</span>
</div>
</mat-card-content>
</mat-card>
<!-- Aggregation Summary (FR-036) -->
@if (aggregations.length > 0) {
<h2 class="h5 mb-3">Aggregation Summary</h2>
<div class="aggregation-cards mb-4">
@for (agg of aggregations; track agg.name) {
<mat-card class="aggregation-card">
<mat-card-header>
<mat-card-title>{{ agg.name }}</mat-card-title>
</mat-card-header>
<mat-card-content>
<div class="aggregation-value">
{{ agg.value | currency:'USD':'symbol':'1.2-6' }}
</div>
@if (agg.breakdown && agg.breakdown.length > 0) {
<div class="breakdown">
@for (item of agg.breakdown; track item.boxNumber) {
<div class="breakdown-row">
<span class="box-label">Box {{ item.boxNumber }}:</span>
<span class="box-value">{{ item.value | currency:'USD':'symbol':'1.2-6' }}</span>
</div>
}
</div>
}
</mat-card-content>
</mat-card>
}
</div>
}
<!-- Raw Box Values -->
<h2 class="h5 mb-3">Box Values</h2>
@if (boxData.length > 0) {
<table mat-table [dataSource]="boxData" class="w-100 box-table">
<ng-container matColumnDef="boxNumber">
<th mat-header-cell *matHeaderCellDef>Box #</th>
<td mat-cell *matCellDef="let row">{{ row.boxNumber }}</td>
</ng-container>
<ng-container matColumnDef="value">
<th mat-header-cell *matHeaderCellDef>Value</th>
<td mat-cell *matCellDef="let row">
@if (row.value === null || row.value === '') {
<span class="text-muted"></span>
} @else if (isPercentage(row.boxNumber)) {
{{ row.value | number:'1.2-6' }}%
} @else if (isNumeric(row.value)) {
{{ row.value | currency:'USD':'symbol':'1.2-6' }}
} @else {
{{ row.value }}
}
</td>
</ng-container>
<tr mat-header-row *matHeaderRowDef="boxColumns"></tr>
<tr mat-row *matRowDef="let row; columns: boxColumns;"></tr>
</table>
} @else {
<p class="text-muted">No box values available.</p>
}
} @else if (!error) {
<p>Loading...</p>
}
</div>

79
apps/client/src/app/pages/k-documents/k-document-detail/k-document-detail.scss

@ -0,0 +1,79 @@
:host {
display: block;
}
.container {
max-width: 960px;
margin: 0 auto;
padding: 1.5rem;
}
.detail-row {
display: flex;
align-items: center;
gap: 0.75rem;
margin-bottom: 0.5rem;
.label {
font-weight: 500;
min-width: 120px;
color: rgba(0, 0, 0, 0.6);
}
}
// Filing status chips
.chip-draft {
--mdc-chip-elevated-container-color: #e0e0e0;
}
.chip-estimated {
--mdc-chip-elevated-container-color: #fff3e0;
}
.chip-final {
--mdc-chip-elevated-container-color: #e8f5e9;
}
// Aggregation cards
.aggregation-cards {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
gap: 1rem;
}
.aggregation-card {
.aggregation-value {
font-size: 1.5rem;
font-weight: 600;
margin-bottom: 0.5rem;
}
.breakdown {
border-top: 1px solid rgba(0, 0, 0, 0.08);
padding-top: 0.5rem;
font-size: 0.875rem;
}
.breakdown-row {
display: flex;
justify-content: space-between;
padding: 2px 0;
}
.box-label {
color: rgba(0, 0, 0, 0.6);
font-family: monospace;
}
.box-value {
font-weight: 500;
}
}
// Box table
.box-table {
max-width: 500px;
}

2
apps/client/src/app/pages/k-documents/k-documents-page.component.ts

@ -129,7 +129,7 @@ export class KDocumentsPageComponent implements OnInit {
public onFormSubmit(event: {
filingStatus: string;
data: Record<string, number>;
data: Record<string, number | string | null>;
}): void {
if (this.editingDoc) {
this.familyOfficeDataService

9
apps/client/src/app/pages/k-documents/k-documents-page.routes.ts

@ -10,5 +10,14 @@ export const routes: Routes = [
component: KDocumentsPageComponent,
path: '',
title: 'K-1 / K-3 Documents'
},
{
canActivate: [AuthGuard],
loadComponent: () =>
import('./k-document-detail/k-document-detail.component').then(
(c) => c.KDocumentDetailComponent
),
path: ':id',
title: 'K-Document Detail'
}
];

205
apps/client/src/app/pages/k1-import/k1-confirmation/k1-confirmation.component.ts

@ -0,0 +1,205 @@
import { K1ImportDataService } from '@ghostfolio/client/services/k1-import-data.service';
import { CommonModule } from '@angular/common';
import {
ChangeDetectionStrategy,
ChangeDetectorRef,
Component,
DestroyRef,
OnInit
} from '@angular/core';
import { takeUntilDestroyed } from '@angular/core/rxjs-interop';
import { FormsModule } from '@angular/forms';
import { MatButtonModule } from '@angular/material/button';
import { MatFormFieldModule } from '@angular/material/form-field';
import { MatProgressBarModule } from '@angular/material/progress-bar';
import { MatSelectModule } from '@angular/material/select';
import { MatTableModule } from '@angular/material/table';
import { ActivatedRoute, Router } from '@angular/router';
/**
 * Shape of the confirmation outcome rendered by the result view. It is either
 * the response of the confirm call, or a reduced object synthesised from the
 * session when the page is opened for an already confirmed import.
 */
interface ConfirmationResult {
importSession: { id: string; status: string };
// Rendered in the "KDocument Created" summary; `id` is also the target of the detail link.
kDocument: {
id: string;
partnershipId: string;
type: string;
taxYear: number;
filingStatus: string;
data: Record<string, number | null>;
};
// Listed in the "Distribution Records" table when non-empty.
distributions: Array<{
id: string;
entityId: string;
type: string;
amount: number;
date: string;
}>;
// Listed in the "Member Allocations" table when non-empty.
allocations: Array<{
entityId: string;
entityName: string;
ownershipPercent: number;
// The template looks up key '1' here — presumably keyed by box number; verify against the API.
allocatedValues: Record<string, number>;
}>;
document: { id: string; type: string; name: string } | null;
}
@Component({
changeDetection: ChangeDetectionStrategy.OnPush,
host: { class: 'page' },
imports: [
CommonModule,
FormsModule,
MatButtonModule,
MatFormFieldModule,
MatProgressBarModule,
MatSelectModule,
MatTableModule
],
selector: 'gf-k1-confirmation',
styleUrls: ['./k1-confirmation.scss'],
templateUrl: './k1-confirmation.html'
})
export class K1ConfirmationComponent implements OnInit {
public error: string | null = null;
public filingStatus: 'DRAFT' | 'ESTIMATED' | 'FINAL' = 'DRAFT';
public filingStatusOptions = ['DRAFT', 'ESTIMATED', 'FINAL'];
public existingKDocumentAction: 'UPDATE' | 'CREATE_NEW' | null = null;
public hasConflict = false;
public isConfirming = false;
public isLoading = true;
public result: ConfirmationResult | null = null;
public sessionId: string;
public sessionStatus: string;
public allocationColumns = [
'entityName',
'ownershipPercent',
'allocatedValues'
];
public distributionColumns = ['entityId', 'type', 'amount', 'date'];
public constructor(
private readonly activatedRoute: ActivatedRoute,
private readonly changeDetectorRef: ChangeDetectorRef,
private readonly destroyRef: DestroyRef,
private readonly k1ImportDataService: K1ImportDataService,
private readonly router: Router
) {}
/** Takes the session id from the route snapshot and loads that import session. */
public ngOnInit(): void {
  const { params } = this.activatedRoute.snapshot;

  this.sessionId = params['id'];
  this.loadSession();
}
/**
 * Confirm the verified K-1 data.
 *
 * Sends the chosen filing status (and, if one was selected, the action to
 * take on an existing KDocument) to the server. On success the returned
 * result is shown; a 409 response switches the page into conflict mode so
 * the user can choose how to handle the existing KDocument.
 *
 * Calls made while a confirmation is already in flight are ignored, so a
 * double click cannot submit the same import twice.
 */
public confirmImport(): void {
  if (this.isConfirming) {
    return;
  }

  this.isConfirming = true;
  this.error = null;
  this.changeDetectorRef.markForCheck();

  const data: any = {
    filingStatus: this.filingStatus
  };

  if (this.existingKDocumentAction) {
    data.existingKDocumentAction = this.existingKDocumentAction;
  }

  this.k1ImportDataService
    .confirmImportSession(this.sessionId, data)
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (res: ConfirmationResult) => {
        this.result = res;
        this.isConfirming = false;
        this.changeDetectorRef.markForCheck();
      },
      error: (err) => {
        this.isConfirming = false;

        // Handle conflict (409) — existing KDocument
        if (err?.status === 409) {
          this.hasConflict = true;
          this.error =
            'A KDocument already exists for this partnership and tax year. Choose an action below.';
        } else {
          this.error =
            err?.error?.message || err?.message || 'Confirmation failed.';
        }

        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Leaves the confirmation page and opens the K-1 import overview.
 */
public goToImportList(): void {
  const importListRoute = ['/k1-import'];

  this.router.navigate(importListRoute);
}
/**
 * Opens the detail page of the KDocument referenced by the confirmation
 * result; does nothing when no KDocument id is available.
 */
public viewKDocument(): void {
  const kDocumentId = this.result?.kDocument?.id;

  if (!kDocumentId) {
    return;
  }

  this.router.navigate(['/k-documents', kDocumentId]);
}
/**
 * Abandons confirmation and returns to the verification step of this session.
 */
public goBackToVerify(): void {
  const verifyRoute = ['/k1-import', this.sessionId, 'verify'];

  this.router.navigate(verifyRoute);
}
/**
 * Loads the import session and decides which view the page shows:
 * - CONFIRMED: a reduced result is synthesised from the session so the
 *   result view can be displayed again;
 * - VERIFIED: the session is ready to be confirmed;
 * - anything else: an error explains that the session cannot be confirmed.
 *
 * The synthesised result always carries a `kDocument` object (with an empty
 * id when the session has none). The result view reads
 * `result.kDocument.id` and its sibling fields unconditionally, so a null
 * `kDocument` would throw while rendering; `viewKDocument()` already ignores
 * an empty id.
 */
private loadSession(): void {
  this.k1ImportDataService
    .fetchImportSession(this.sessionId)
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (session: any) => {
        this.sessionStatus = session.status;

        if (session.status === 'CONFIRMED') {
          // Already confirmed — show result view
          const confirmed: ConfirmationResult = {
            importSession: { id: session.id, status: session.status },
            kDocument: {
              id: session.kDocumentId || '',
              partnershipId: session.partnershipId,
              type: 'K1',
              taxYear: session.taxYear,
              filingStatus: '',
              data: {}
            },
            distributions: [],
            allocations: [],
            document: null
          };

          this.result = confirmed;
        } else if (session.status !== 'VERIFIED') {
          this.error = `Session is in ${session.status} status. Must be VERIFIED to confirm.`;
        }

        this.isLoading = false;
        this.changeDetectorRef.markForCheck();
      },
      error: (err) => {
        this.error =
          err?.error?.message || err?.message || 'Failed to load session.';
        this.isLoading = false;
        this.changeDetectorRef.markForCheck();
      }
    });
}
}

167
apps/client/src/app/pages/k1-import/k1-confirmation/k1-confirmation.html

@ -0,0 +1,167 @@
<!--
  K-1 confirmation page.
  Shows a progress bar while loading, the confirmation result once `result`
  is set, and otherwise the form used to confirm a verified import session.
-->
<div class="container">
  <div class="row">
    <div class="col">
      @if (error) {
        <div class="alert alert-danger mb-3">
          {{ error }}
        </div>
      }
      @if (isLoading) {
        <mat-progress-bar mode="indeterminate"></mat-progress-bar>
      } @else if (result) {
        <!-- Confirmation Result -->
        <h1 class="h3 mb-4 text-center text-success">K-1 Import Confirmed</h1>
        <!-- KDocument Summary -->
        <!--
          The component sets `result.kDocument` to null when an already
          confirmed session carries no KDocument id, so the summary must be
          guarded before any of its properties are read.
        -->
        @if (result.kDocument) {
          <section class="mb-4">
            <h2 class="h5 mb-3">KDocument Created</h2>
            <div class="summary-card p-3">
              <div class="d-flex justify-content-between">
                <span>ID</span>
                <strong>{{ result.kDocument.id }}</strong>
              </div>
              <div class="d-flex justify-content-between">
                <span>Type</span>
                <strong>{{ result.kDocument.type }}</strong>
              </div>
              <div class="d-flex justify-content-between">
                <span>Tax Year</span>
                <strong>{{ result.kDocument.taxYear }}</strong>
              </div>
              <div class="d-flex justify-content-between">
                <span>Filing Status</span>
                <strong>{{ result.kDocument.filingStatus }}</strong>
              </div>
            </div>
          </section>
        }
        <!-- Member Allocations -->
        @if (result.allocations.length > 0) {
          <section class="mb-4">
            <h2 class="h5 mb-3">Member Allocations</h2>
            <table mat-table [dataSource]="result.allocations" class="w-100">
              <ng-container matColumnDef="entityName">
                <th mat-header-cell *matHeaderCellDef>Member</th>
                <td mat-cell *matCellDef="let a">{{ a.entityName }}</td>
              </ng-container>
              <ng-container matColumnDef="ownershipPercent">
                <th mat-header-cell *matHeaderCellDef>Ownership %</th>
                <td mat-cell *matCellDef="let a">{{ a.ownershipPercent }}%</td>
              </ng-container>
              <ng-container matColumnDef="allocatedValues">
                <th mat-header-cell *matHeaderCellDef>Key Values</th>
                <td mat-cell *matCellDef="let a">
                  <!-- Only Box 1 is surfaced here, and only when a value was allocated -->
                  @if (a.allocatedValues['1'] !== undefined) {
                    Box 1: {{ a.allocatedValues['1'] | number:'1.2-2' }}
                  }
                </td>
              </ng-container>
              <tr mat-header-row *matHeaderRowDef="allocationColumns"></tr>
              <tr mat-row *matRowDef="let row; columns: allocationColumns"></tr>
            </table>
          </section>
        }
        <!-- Distributions -->
        @if (result.distributions.length > 0) {
          <section class="mb-4">
            <h2 class="h5 mb-3">Distribution Records</h2>
            <table mat-table [dataSource]="result.distributions" class="w-100">
              <ng-container matColumnDef="entityId">
                <th mat-header-cell *matHeaderCellDef>Member</th>
                <td mat-cell *matCellDef="let d">{{ d.entityId }}</td>
              </ng-container>
              <ng-container matColumnDef="type">
                <th mat-header-cell *matHeaderCellDef>Type</th>
                <td mat-cell *matCellDef="let d">{{ d.type }}</td>
              </ng-container>
              <ng-container matColumnDef="amount">
                <th mat-header-cell *matHeaderCellDef>Amount</th>
                <td mat-cell *matCellDef="let d">{{ d.amount | number:'1.2-2' }}</td>
              </ng-container>
              <ng-container matColumnDef="date">
                <th mat-header-cell *matHeaderCellDef>Date</th>
                <td mat-cell *matCellDef="let d">{{ d.date | date:'mediumDate' }}</td>
              </ng-container>
              <tr mat-header-row *matHeaderRowDef="distributionColumns"></tr>
              <tr mat-row *matRowDef="let row; columns: distributionColumns"></tr>
            </table>
          </section>
        }
        <!-- Linked Document -->
        @if (result.document) {
          <section class="mb-4">
            <h2 class="h5 mb-3">Linked Document</h2>
            <div class="summary-card p-3">
              <div class="d-flex justify-content-between">
                <span>File</span>
                <strong>{{ result.document.name }}</strong>
              </div>
              <div class="d-flex justify-content-between">
                <span>Type</span>
                <strong>{{ result.document.type }}</strong>
              </div>
            </div>
          </section>
        }
        <!-- Actions -->
        <div class="actions d-flex justify-content-between mt-4">
          <button mat-stroked-button (click)="goToImportList()">
            Back to Import List
          </button>
          <button mat-flat-button color="primary" (click)="viewKDocument()">
            View KDocument
          </button>
        </div>
      } @else {
        <!-- Confirmation Form -->
        <h1 class="h3 mb-4 text-center">Confirm K-1 Import</h1>
        <div class="confirmation-form mx-auto">
          <div class="mb-3">
            <mat-form-field class="w-100">
              <mat-label>Filing Status</mat-label>
              <mat-select [(ngModel)]="filingStatus">
                @for (status of filingStatusOptions; track status) {
                  <mat-option [value]="status">{{ status }}</mat-option>
                }
              </mat-select>
            </mat-form-field>
          </div>
          <!-- Shown only when the component flagged a conflict; the user must then pick an action -->
          @if (hasConflict) {
            <div class="mb-3">
              <mat-form-field class="w-100">
                <mat-label>Existing KDocument Action</mat-label>
                <mat-select [(ngModel)]="existingKDocumentAction">
                  <mat-option value="UPDATE">Update existing</mat-option>
                  <mat-option value="CREATE_NEW">Create new version</mat-option>
                </mat-select>
              </mat-form-field>
            </div>
          }
          <div class="d-flex justify-content-between">
            <button mat-stroked-button (click)="goBackToVerify()">
              Back to Verify
            </button>
            <button
              mat-flat-button
              color="primary"
              [disabled]="isConfirming || (hasConflict && !existingKDocumentAction)"
              (click)="confirmImport()">
              @if (isConfirming) {
                Confirming...
              } @else {
                Confirm & Create KDocument
              }
            </button>
          </div>
        </div>
      }
    </div>
  </div>
</div>

37
apps/client/src/app/pages/k1-import/k1-confirmation/k1-confirmation.scss

@ -0,0 +1,37 @@
// Styles for the K-1 confirmation page component.
:host {
display: block;
}
// Caps the width of the confirmation form.
.confirmation-form {
max-width: 480px;
}
// Bordered card; each direct child row is separated by a hairline.
.summary-card {
border: 1px solid var(--border-color, #e0e0e0);
border-radius: 8px;
> div {
padding: 4px 0;
// Note the lighter fallback colour here (#f0f0f0) versus the outer border (#e0e0e0).
border-bottom: 1px solid var(--border-color, #f0f0f0);
// The final row has no separator beneath it.
&:last-child {
border-bottom: none;
}
}
}
// Green used for the success heading.
.text-success {
color: #4caf50;
}
// Red-tinted banner used to show the error message.
.alert-danger {
background-color: rgba(244, 67, 54, 0.1);
border: 1px solid rgba(244, 67, 54, 0.3);
border-radius: 4px;
color: #f44336;
padding: 12px 16px;
}
// Extra space below the action button row.
.actions {
padding-bottom: 2rem;
}

269
apps/client/src/app/pages/k1-import/k1-import-page.component.ts

@ -0,0 +1,269 @@
import { K1ImportDataService } from '@ghostfolio/client/services/k1-import-data.service';
import { FamilyOfficeDataService } from '@ghostfolio/client/services/family-office-data.service';
import { CommonModule } from '@angular/common';
import {
ChangeDetectionStrategy,
ChangeDetectorRef,
Component,
DestroyRef,
OnDestroy,
OnInit
} from '@angular/core';
import { takeUntilDestroyed } from '@angular/core/rxjs-interop';
import { FormsModule } from '@angular/forms';
import { MatButtonModule } from '@angular/material/button';
import { MatFormFieldModule } from '@angular/material/form-field';
import { MatIconModule } from '@angular/material/icon';
import { MatProgressBarModule } from '@angular/material/progress-bar';
import { MatSelectModule } from '@angular/material/select';
import { MatTableModule } from '@angular/material/table';
import { MatTooltipModule } from '@angular/material/tooltip';
import { Router, RouterModule } from '@angular/router';
import { addIcons } from 'ionicons';
import {
cloudUploadOutline,
documentTextOutline
} from 'ionicons/icons';
/**
 * Page component for the K-1 PDF import flow: lets the user pick a
 * partnership, tax year and PDF file, uploads it, polls the resulting import
 * session until extraction finishes, and lists the import history of the
 * selected partnership.
 */
@Component({
changeDetection: ChangeDetectionStrategy.OnPush,
host: { class: 'page' },
imports: [
CommonModule,
FormsModule,
MatButtonModule,
MatFormFieldModule,
MatIconModule,
MatProgressBarModule,
MatSelectModule,
MatTableModule,
MatTooltipModule,
RouterModule
],
selector: 'gf-k1-import-page',
styleUrls: ['./k1-import-page.scss'],
templateUrl: './k1-import-page.html'
})
export class K1ImportPageComponent implements OnDestroy, OnInit {
// Message shown in the error banner; null when there is nothing to report.
public error: string | null = null;
// Either a local label ('Uploading...', 'Processing...') or the status string of the polled session.
public extractionStatus: string | null = null;
// Column ids of the import-history table, in display order.
public historyColumns = ['createdAt', 'fileName', 'taxYear', 'status', 'kDocument', 'actions'];
// Import sessions of the selected partnership, as returned by the data service.
public importHistory: any[] = [];
// True while the upload request is in flight.
public isUploading = false;
// Options of the partnership dropdown.
public partnerships: Array<{ id: string; name: string }> = [];
// File that passed client-side validation; null when none is selected.
public selectedFile: File | null = null;
public selectedPartnershipId = '';
// Id of the session being processed; while null the upload form is shown.
public sessionId: string | null = null;
// Selected tax year; the constructor defaults it to the previous calendar year.
public taxYear: number;
public taxYearOptions: number[] = [];
// NOTE(review): not read or written anywhere else in the visible part of this class.
public uploadProgress = 0;
// Handle returned by setInterval while polling; null when no polling is active.
private pollingInterval: any = null;
/**
 * Registers the two ionicons this page needs and seeds the tax-year picker:
 * the default selection is the previous calendar year and the options run
 * from the current year back ten years (eleven entries, newest first).
 */
public constructor(
  private readonly changeDetectorRef: ChangeDetectorRef,
  private readonly destroyRef: DestroyRef,
  private readonly familyOfficeDataService: FamilyOfficeDataService,
  private readonly k1ImportDataService: K1ImportDataService,
  private readonly router: Router
) {
  addIcons({ cloudUploadOutline, documentTextOutline });

  const thisYear = new Date().getFullYear();

  this.taxYear = thisYear - 1;
  this.taxYearOptions.push(
    ...Array.from({ length: 11 }, (_, yearsBack) => thisYear - yearsBack)
  );
}
/** Stops the status-polling timer when the component is destroyed. */
public ngOnDestroy(): void {
this.stopPolling();
}
/** Loads the partnership options for the dropdown when the component initialises. */
public ngOnInit(): void {
this.fetchPartnerships();
}
/**
 * Selection-change handler of the partnership dropdown: reloads the import
 * history once a partnership is selected.
 */
public onPartnershipChange(): void {
  if (!this.selectedPartnershipId) {
    return;
  }

  this.loadImportHistory();
}
/**
 * Fetches the import history of the currently selected partnership.
 *
 * Does nothing when no partnership is selected. A response is ignored when
 * the selection changed while the request was in flight, so a slow response
 * for a previous partnership can no longer overwrite the current list.
 * On failure the history is cleared.
 */
public loadImportHistory(): void {
  const partnershipId = this.selectedPartnershipId;

  if (!partnershipId) {
    return;
  }

  // True once the user has switched to another partnership
  const isStale = (): boolean => partnershipId !== this.selectedPartnershipId;

  this.k1ImportDataService
    .fetchImportHistory({ partnershipId })
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (history) => {
        if (isStale()) {
          return;
        }

        this.importHistory = history;
        this.changeDetectorRef.markForCheck();
      },
      error: () => {
        if (isStale()) {
          return;
        }

        this.importHistory = [];
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Asks the backend to re-process an existing import session and, on success,
 * switches the page to the processing view and starts polling the returned
 * session.
 *
 * Any error left over from an earlier action is cleared first, so a stale
 * banner is not shown while the new run is processing.
 *
 * @param sessionId Id of the import session to re-process.
 */
public reprocessSession(sessionId: string): void {
  this.error = null;
  this.changeDetectorRef.markForCheck();

  this.k1ImportDataService
    .reprocessImportSession(sessionId)
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (result) => {
        this.sessionId = result.id;
        this.extractionStatus = 'Processing...';
        this.changeDetectorRef.markForCheck();
        this.startPolling(result.id);
      },
      error: (err) => {
        this.error =
          err?.error?.message || err?.message || 'Re-processing failed.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Handles both the file input's change event and a drop on the dropzone.
 *
 * Takes the first dropped file when the event is a DragEvent carrying files,
 * otherwise the first file of the event target. The file is accepted only if
 * its MIME type is `application/pdf` and it is at most 25 MB; a rejected file
 * clears the selection and sets the matching error message.
 *
 * @param event Change or drop event.
 */
public onFileSelected(event: Event): void {
  const dropped =
    event instanceof DragEvent ? event.dataTransfer?.files : undefined;
  const candidates = dropped?.length
    ? dropped
    : (event.target as HTMLInputElement).files;
  const file = candidates && candidates.length > 0 ? candidates[0] : null;

  if (!file) {
    return;
  }

  // Client-side validation
  let rejection: string | null = null;

  if (file.type !== 'application/pdf') {
    rejection = 'Please select a valid PDF file.';
  } else if (file.size > 25 * 1024 * 1024) {
    rejection = 'File exceeds 25 MB size limit.';
  }

  this.error = rejection;
  this.selectedFile = rejection === null ? file : null;
  this.changeDetectorRef.markForCheck();
}
/**
 * Uploads the selected PDF together with the partnership id and tax year as
 * multipart form data. On success the returned session id is stored and
 * polling for the extraction result starts; on failure the error message is
 * shown and the status label is cleared.
 */
public uploadK1(): void {
  const file = this.selectedFile;
  const partnershipId = this.selectedPartnershipId;

  if (!file || !partnershipId || !this.taxYear) {
    this.error = 'Please select a partnership, tax year, and PDF file.';
    this.changeDetectorRef.markForCheck();
    return;
  }

  this.error = null;
  this.isUploading = true;
  this.extractionStatus = 'Uploading...';
  this.changeDetectorRef.markForCheck();

  const payload = new FormData();
  payload.append('file', file);
  payload.append('partnershipId', partnershipId);
  payload.append('taxYear', String(this.taxYear));

  this.k1ImportDataService
    .uploadK1(payload)
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (created) => {
        this.isUploading = false;
        this.sessionId = created.id;
        this.extractionStatus = 'Processing...';
        this.changeDetectorRef.markForCheck();
        // Extraction runs server-side; poll until it completes
        this.startPolling(created.id);
      },
      error: (failure) => {
        this.extractionStatus = null;
        this.isUploading = false;
        this.error =
          failure?.error?.message || failure?.message || 'Upload failed.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Returns the page to the upload form: stops polling and clears the selected
 * file, the current session, the status label and any error.
 */
public resetForm(): void {
  this.stopPolling();

  this.error = null;
  this.extractionStatus = null;
  this.sessionId = null;
  this.selectedFile = null;

  this.changeDetectorRef.markForCheck();
}
/**
 * Loads the partnerships and reduces each to the `{ id, name }` pair needed
 * by the dropdown.
 *
 * A failed request is reported through the error banner instead of being
 * dropped silently, which would otherwise leave an empty dropdown with no
 * explanation.
 */
private fetchPartnerships(): void {
  this.familyOfficeDataService
    .fetchPartnerships()
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (partnerships) => {
        this.partnerships = partnerships.map(({ id, name }) => ({ id, name }));
        this.changeDetectorRef.markForCheck();
      },
      error: (err) => {
        this.error =
          err?.error?.message ||
          err?.message ||
          'Failed to load partnerships.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Polls the import session every two seconds until it reaches a terminal
 * status. `EXTRACTED` stops polling and navigates to the verification page;
 * `FAILED` stops polling and shows the session's error message.
 *
 * Each timer is identified by its own handle. A response is ignored unless
 * that handle is still the active one, so a request that was in flight when
 * polling was stopped (cancel, terminal status, or a restart) can no longer
 * change the status or trigger a navigation afterwards.
 *
 * @param sessionId Id of the import session to poll.
 */
private startPolling(sessionId: string): void {
  this.stopPolling();

  const handle = setInterval(() => {
    this.k1ImportDataService
      .fetchImportSession(sessionId)
      .pipe(takeUntilDestroyed(this.destroyRef))
      .subscribe({
        next: (session) => {
          if (this.pollingInterval !== handle) {
            // Polling was stopped or restarted while this request was in flight
            return;
          }

          this.extractionStatus = session.status;

          if (session.status === 'EXTRACTED') {
            this.stopPolling();
            this.router.navigate(['/k1-import', sessionId, 'verify']);
          } else if (session.status === 'FAILED') {
            this.stopPolling();
            this.error = session.errorMessage || 'Extraction failed.';
            this.extractionStatus = 'FAILED';
          }

          this.changeDetectorRef.markForCheck();
        },
        error: () => {
          // Continue polling on transient errors
        }
      });
  }, 2000); // Poll every 2 seconds

  this.pollingInterval = handle;
}
/** Clears the polling timer, if one is active, and forgets its handle. */
private stopPolling(): void {
  if (!this.pollingInterval) {
    return;
  }

  clearInterval(this.pollingInterval);
  this.pollingInterval = null;
}
}

162
apps/client/src/app/pages/k1-import/k1-import-page.html

@ -0,0 +1,162 @@
<!--
  K-1 import page.
  Shows the upload form while no session is active and the extraction
  progress otherwise, followed by the import history table when there are
  entries.
-->
<div class="container">
  <div class="row">
    <div class="col">
      <h1 class="d-none d-sm-block h3 mb-4 text-center">K-1 PDF Import</h1>
      @if (error) {
        <div class="alert alert-danger mb-3">
          {{ error }}
        </div>
      }
      @if (!sessionId) {
        <!-- Upload Form -->
        <div class="upload-form mx-auto">
          <div class="mb-3">
            <mat-form-field class="w-100">
              <mat-label>Partnership</mat-label>
              <mat-select [(ngModel)]="selectedPartnershipId" (selectionChange)="onPartnershipChange()">
                @for (p of partnerships; track p.id) {
                  <mat-option [value]="p.id">{{ p.name }}</mat-option>
                }
              </mat-select>
            </mat-form-field>
          </div>
          <div class="mb-3">
            <mat-form-field class="w-100">
              <mat-label>Tax Year</mat-label>
              <mat-select [(ngModel)]="taxYear">
                @for (y of taxYearOptions; track y) {
                  <mat-option [value]="y">{{ y }}</mat-option>
                }
              </mat-select>
            </mat-form-field>
          </div>
          <div class="mb-3">
            <!--
              The dropzone acts as a button for the hidden file input, so it is
              exposed as one to assistive technology and is operable with Enter.
            -->
            <div class="upload-dropzone text-center p-4"
              role="button"
              tabindex="0"
              (click)="fileInput.click()"
              (keydown.enter)="fileInput.click()"
              (dragover)="$event.preventDefault()"
              (drop)="$event.preventDefault(); onFileSelected($event)">
              <input #fileInput
                accept="application/pdf"
                hidden
                type="file"
                (change)="onFileSelected($event)" />
              @if (selectedFile) {
                <ion-icon name="document-text-outline" size="large"></ion-icon>
                <p class="mt-2 mb-0">{{ selectedFile.name }}</p>
                <small class="text-muted">{{ (selectedFile.size / 1024 / 1024).toFixed(2) }} MB</small>
              } @else {
                <ion-icon name="cloud-upload-outline" size="large"></ion-icon>
                <p class="mt-2 mb-0">Click or drag a K-1 PDF file here</p>
                <small class="text-muted">Maximum 25 MB</small>
              }
            </div>
          </div>
          <button
            [disabled]="!selectedFile || !selectedPartnershipId || !taxYear || isUploading"
            class="w-100"
            color="primary"
            mat-flat-button
            (click)="uploadK1()">
            @if (isUploading) {
              Uploading...
            } @else {
              Upload & Scan K-1
            }
          </button>
        </div>
      } @else {
        <!-- Extraction Progress -->
        <div class="processing-status text-center mx-auto">
          <h3>Processing K-1</h3>
          <!-- 'Processing...' is the local label set after upload; 'PROCESSING' is a polled session status -->
          @if (extractionStatus === 'Processing...' || extractionStatus === 'PROCESSING') {
            <mat-progress-bar mode="indeterminate"></mat-progress-bar>
            <p class="mt-3">Extracting data from your K-1 PDF...</p>
            <p class="text-muted">This usually takes less than 30 seconds.</p>
          } @else if (extractionStatus === 'EXTRACTED') {
            <p class="text-success">Extraction complete! Redirecting to verification...</p>
          } @else if (extractionStatus === 'FAILED') {
            <p class="text-danger">Extraction failed.</p>
          }
          <button
            class="mt-3"
            color="warn"
            mat-stroked-button
            (click)="resetForm()">
            Cancel & Start Over
          </button>
        </div>
      }
    </div>
  </div>
  <!-- Import History -->
  @if (importHistory.length > 0) {
    <div class="row mt-4">
      <div class="col">
        <h3 class="h5 mb-3">Import History</h3>
        <table mat-table [dataSource]="importHistory" class="w-100">
          <ng-container matColumnDef="createdAt">
            <th mat-header-cell *matHeaderCellDef>Date</th>
            <td mat-cell *matCellDef="let row">{{ row.createdAt | date:'short' }}</td>
          </ng-container>
          <ng-container matColumnDef="fileName">
            <th mat-header-cell *matHeaderCellDef>File</th>
            <td mat-cell *matCellDef="let row">{{ row.fileName }}</td>
          </ng-container>
          <ng-container matColumnDef="taxYear">
            <th mat-header-cell *matHeaderCellDef>Tax Year</th>
            <td mat-cell *matCellDef="let row">{{ row.taxYear }}</td>
          </ng-container>
          <ng-container matColumnDef="status">
            <th mat-header-cell *matHeaderCellDef>Status</th>
            <td mat-cell *matCellDef="let row">
              <span class="badge"
                [class.badge-success]="row.status === 'CONFIRMED'"
                [class.badge-warning]="row.status === 'EXTRACTED' || row.status === 'VERIFIED'"
                [class.badge-danger]="row.status === 'FAILED' || row.status === 'CANCELLED'"
                [class.badge-info]="row.status === 'PROCESSING'">
                {{ row.status }}
              </span>
            </td>
          </ng-container>
          <ng-container matColumnDef="kDocument">
            <th mat-header-cell *matHeaderCellDef>K-Document</th>
            <td mat-cell *matCellDef="let row">
              <!-- The cell stays empty for sessions without a linked KDocument -->
              @if (row.kDocumentId) {
                <a [routerLink]="['/k-documents']">View</a>
              }
            </td>
          </ng-container>
          <ng-container matColumnDef="actions">
            <th mat-header-cell *matHeaderCellDef>Actions</th>
            <td mat-cell *matCellDef="let row">
              <!-- Icon-only button: aria-label gives it an accessible name -->
              <button mat-icon-button
                aria-label="Re-process with current cell mapping"
                (click)="reprocessSession(row.id)"
                matTooltip="Re-process with current cell mapping">
                <mat-icon>refresh</mat-icon>
              </button>
            </td>
          </ng-container>
          <tr mat-header-row *matHeaderRowDef="historyColumns"></tr>
          <tr mat-row *matRowDef="let row; columns: historyColumns;"></tr>
        </table>
      </div>
    </div>
  }
</div>

33
apps/client/src/app/pages/k1-import/k1-import-page.routes.ts

@ -0,0 +1,33 @@
import { AuthGuard } from '@ghostfolio/client/core/auth.guard';
import { Routes } from '@angular/router';
/**
 * Routes of the K-1 import feature. Every route is protected by `AuthGuard`
 * and lazy-loads its component: the upload page at the feature root, then the
 * verification and confirmation steps keyed by the import session id.
 */
export const routes: Routes = [
  {
    path: '',
    title: 'K-1 Import',
    canActivate: [AuthGuard],
    loadComponent: async () =>
      (await import('./k1-import-page.component')).K1ImportPageComponent
  },
  {
    path: ':id/verify',
    title: 'Verify K-1 Import',
    canActivate: [AuthGuard],
    loadComponent: async () =>
      (await import('./k1-verification/k1-verification.component'))
        .K1VerificationComponent
  },
  {
    path: ':id/confirm',
    title: 'Confirm K-1 Import',
    canActivate: [AuthGuard],
    loadComponent: async () =>
      (await import('./k1-confirmation/k1-confirmation.component'))
        .K1ConfirmationComponent
  }
];

47
apps/client/src/app/pages/k1-import/k1-import-page.scss

@ -0,0 +1,47 @@
// Styles for the K-1 import page component.
:host {
display: block;
}
// Caps the width of the upload form.
.upload-form {
max-width: 480px;
}
// Dashed click/drag target for the PDF file.
.upload-dropzone {
border: 2px dashed var(--border-color, #ccc);
border-radius: 8px;
cursor: pointer;
transition: border-color 0.2s ease;
// Highlight the border on hover.
&:hover {
border-color: var(--primary-color, #1976d2);
}
ion-icon {
font-size: 48px;
color: var(--text-muted, #999);
}
}
// Container of the extraction progress view.
.processing-status {
max-width: 480px;
mat-progress-bar {
margin-top: 1rem;
}
}
// Green used for the "extraction complete" message.
.text-success {
color: #4caf50;
}
// Red used for the "extraction failed" message.
.text-danger {
color: #f44336;
}
// Red-tinted banner used to show the error message.
.alert-danger {
background-color: rgba(244, 67, 54, 0.1);
border: 1px solid rgba(244, 67, 54, 0.3);
border-radius: 4px;
color: #f44336;
padding: 12px 16px;
}

434
apps/client/src/app/pages/k1-import/k1-verification/k1-verification.component.ts

@ -0,0 +1,434 @@
import { K1ImportDataService } from '@ghostfolio/client/services/k1-import-data.service';
import type {
K1AggregationResult,
K1ExtractedField,
K1UnmappedItem
} from '@ghostfolio/common/interfaces';
import { CommonModule } from '@angular/common';
import {
ChangeDetectionStrategy,
ChangeDetectorRef,
Component,
DestroyRef,
OnInit
} from '@angular/core';
import { takeUntilDestroyed } from '@angular/core/rxjs-interop';
import { FormsModule } from '@angular/forms';
import { MatButtonModule } from '@angular/material/button';
import { MatCheckboxModule } from '@angular/material/checkbox';
import { MatFormFieldModule } from '@angular/material/form-field';
import { MatIconModule } from '@angular/material/icon';
import { MatInputModule } from '@angular/material/input';
import { MatProgressBarModule } from '@angular/material/progress-bar';
import { MatSelectModule } from '@angular/material/select';
import { MatTableModule } from '@angular/material/table';
import { MatTooltipModule } from '@angular/material/tooltip';
import { ActivatedRoute, Router } from '@angular/router';
import { addIcons } from 'ionicons';
import {
checkmarkCircleOutline,
alertCircleOutline,
closeCircleOutline,
trashOutline
} from 'ionicons/icons';
/**
 * An extracted K-1 field plus the view state needed for inline editing in
 * the verification table.
 */
interface EditableField extends K1ExtractedField {
// True while the row is in inline edit mode.
isEditing: boolean;
// Draft value bound to the value input; committed by saveEdit.
editValue: string;
// Draft label bound to the label input; committed by saveEdit.
editLabel: string;
// Committed cell type; the component defaults it to 'number' when the extraction has none.
cellType: string;
// Draft cell type bound to the type select; committed by saveEdit.
editCellType: string;
}
/**
 * An unmapped extraction item plus the user's decision about it.
 */
interface EditableUnmappedItem extends K1UnmappedItem {
// How the user resolved the item; null while it is still unresolved.
resolution: 'assigned' | 'discarded' | null;
// Target box when the item was assigned; null otherwise.
assignedBoxNumber: string | null;
}
/**
 * Verification step of the K-1 import: shows the extracted fields for review
 * and inline editing, lets the user resolve unmapped items, displays
 * client-side aggregation totals, and submits the verified data.
 */
@Component({
changeDetection: ChangeDetectionStrategy.OnPush,
host: { class: 'page' },
imports: [
CommonModule,
FormsModule,
MatButtonModule,
MatCheckboxModule,
MatFormFieldModule,
MatIconModule,
MatInputModule,
MatProgressBarModule,
MatSelectModule,
MatTableModule,
MatTooltipModule
],
selector: 'gf-k1-verification',
styleUrls: ['./k1-verification.scss'],
templateUrl: './k1-verification.html'
})
export class K1VerificationComponent implements OnInit {
// Aggregation totals recomputed from the current field values.
public aggregations: K1AggregationResult[] = [];
// True once every non-HIGH-confidence field is reviewed and every unmapped item is resolved.
public canConfirm = false;
// Message shown in the error banner; null when there is nothing to report.
public error: string | null = null;
// Rows of the extracted-values table.
public fields: EditableField[] = [];
// True until the session has been loaded (or loading failed).
public isLoading = true;
// True while the verification request is in flight.
public isSaving = false;
// Import session id, read from the route's `id` parameter in ngOnInit.
public sessionId: string;
// Tax year of the loaded session.
public taxYear: number;
public unmappedItems: EditableUnmappedItem[] = [];
// Options of the cell-type select shown while a row is being edited.
public cellTypeOptions = [
{ value: 'number', label: 'Number ($)' },
{ value: 'string', label: 'String' },
{ value: 'percentage', label: 'Percentage (%)' },
{ value: 'boolean', label: 'Boolean' }
];
// Column definitions for the fields table
public displayedColumns = [
'boxNumber',
'label',
'rawValue',
'numericValue',
'cellType',
'confidence',
'reviewed',
'actions'
];
// Available box numbers for assigning unmapped items
public availableBoxNumbers: string[] = [];
/**
 * Receives the component's dependencies and registers the four ionicons
 * listed below with the ionicons registry.
 */
public constructor(
private readonly activatedRoute: ActivatedRoute,
private readonly changeDetectorRef: ChangeDetectorRef,
private readonly destroyRef: DestroyRef,
private readonly k1ImportDataService: K1ImportDataService,
private readonly router: Router
) {
addIcons({
checkmarkCircleOutline,
alertCircleOutline,
closeCircleOutline,
trashOutline
});
}
/**
 * Reads the session id from the route snapshot (it is not re-read on later
 * parameter changes) and loads that session.
 */
public ngOnInit(): void {
this.sessionId = this.activatedRoute.snapshot.params['id'];
this.loadSession();
}
/**
 * Maps a confidence level to the CSS class of its badge.
 *
 * @param level Confidence level; 'HIGH', 'MEDIUM' and 'LOW' are recognised.
 * @returns The matching `confidence-*` class, or an empty string for any
 * other value.
 */
public getConfidenceClass(level: string): string {
  if (level === 'HIGH') {
    return 'confidence-high';
  }

  if (level === 'MEDIUM') {
    return 'confidence-medium';
  }

  if (level === 'LOW') {
    return 'confidence-low';
  }

  return '';
}
/**
 * Puts a row into inline edit mode and seeds the draft value, label and cell
 * type from the field's current state.
 *
 * @param field Row to edit.
 */
public startEditing(field: EditableField): void {
  Object.assign(field, {
    isEditing: true,
    editValue: field.rawValue,
    editLabel: field.customLabel || field.label,
    editCellType: field.cellType
  });

  this.changeDetectorRef.markForCheck();
}
/**
 * Commits the draft value, label and cell type of a row, marks it as user
 * edited and reviewed, and re-derives its parsed value:
 *
 * - `boolean`: the raw value is normalised to 'true' (for true/yes/1/x,
 *   case-insensitive) or 'false'; no numeric value.
 * - `string`: no numeric value.
 * - anything else (number, percentage): `$`, `,` and `%` are stripped and an
 *   accounting-style `(123)` becomes `-123` before parsing.
 *
 * A missing draft value is treated as an empty string for parsing instead of
 * throwing, and a non-finite parse result (NaN or Infinity) yields no
 * numeric value. Aggregations and the confirm gate are refreshed afterwards.
 *
 * @param field Row whose draft should be committed.
 */
public saveEdit(field: EditableField): void {
  // The draft is seeded from rawValue, which may be absent in extracted data
  const draft = field.editValue ?? '';

  field.rawValue = field.editValue;
  field.customLabel =
    field.editLabel !== field.label ? field.editLabel : null;
  field.cellType = field.editCellType;
  field.isUserEdited = true;
  field.isReviewed = true;
  field.isEditing = false;

  // Parse value based on cell type
  if (field.cellType === 'boolean') {
    const normalized = draft.toLowerCase().trim();

    field.numericValue = null;
    field.rawValue = ['true', 'yes', '1', 'x'].includes(normalized)
      ? 'true'
      : 'false';
  } else if (field.cellType === 'string') {
    field.numericValue = null;
  } else {
    // number or percentage
    const cleaned = draft
      .replace(/[$,%]/g, '')
      .replace(/\(([^)]+)\)/, '-$1')
      .trim();
    const parsed = parseFloat(cleaned);

    field.numericValue = Number.isFinite(parsed) ? parsed : null;
  }

  this.recalculateAggregations();
  this.checkConfirmability();
  this.changeDetectorRef.markForCheck();
}
/**
 * Leaves inline edit mode without committing the draft. The draft values are
 * left as they are; startEditing re-seeds them the next time the row is
 * edited.
 *
 * @param field Row whose edit is cancelled.
 */
public cancelEdit(field: EditableField): void {
field.isEditing = false;
this.changeDetectorRef.markForCheck();
}
/**
 * Flips the reviewed flag of a field and re-evaluates whether the import can
 * be confirmed.
 *
 * @param field Row whose reviewed flag is toggled.
 */
public toggleReviewed(field: EditableField): void {
field.isReviewed = !field.isReviewed;
this.checkConfirmability();
this.changeDetectorRef.markForCheck();
}
/**
 * Resolves an unmapped item by assigning it to a box number, then
 * re-evaluates whether the import can be confirmed.
 *
 * @param item Unmapped item to resolve.
 * @param boxNumber Box the item is assigned to.
 */
public assignUnmappedItem(
  item: EditableUnmappedItem,
  boxNumber: string
): void {
  Object.assign(item, {
    resolution: 'assigned',
    assignedBoxNumber: boxNumber
  });

  this.checkConfirmability();
  this.changeDetectorRef.markForCheck();
}
/**
 * Resolves an unmapped item by discarding it (clearing any assigned box),
 * then re-evaluates whether the import can be confirmed.
 *
 * @param item Unmapped item to discard.
 */
public discardUnmappedItem(item: EditableUnmappedItem): void {
  Object.assign(item, {
    resolution: 'discarded',
    assignedBoxNumber: null
  });

  this.checkConfirmability();
  this.changeDetectorRef.markForCheck();
}
/**
 * Submits the reviewed fields and resolved unmapped items for the session
 * and, on success, navigates to the confirmation step.
 *
 * Does nothing while the confirm gate is closed or while a previous submit
 * is still in flight, so a repeated trigger cannot send the request twice.
 * View-only state (edit drafts, `isEditing`) is not part of the payload.
 */
public submitVerification(): void {
  if (!this.canConfirm || this.isSaving) {
    return;
  }

  this.isSaving = true;
  this.error = null;
  this.changeDetectorRef.markForCheck();

  const data = {
    taxYear: this.taxYear,
    fields: this.fields.map((f) => ({
      boxNumber: f.boxNumber,
      label: f.label,
      customLabel: f.customLabel,
      rawValue: f.rawValue,
      numericValue: f.numericValue,
      cellType: f.cellType,
      confidence: f.confidence,
      confidenceLevel: f.confidenceLevel,
      isUserEdited: f.isUserEdited,
      isReviewed: f.isReviewed
    })),
    unmappedItems: this.unmappedItems.map((item) => ({
      rawLabel: item.rawLabel,
      rawValue: item.rawValue,
      numericValue: item.numericValue,
      confidence: item.confidence,
      pageNumber: item.pageNumber,
      resolution: item.resolution,
      assignedBoxNumber: item.assignedBoxNumber
    }))
  };

  this.k1ImportDataService
    .verifyImportSession(this.sessionId, data as any)
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: () => {
        this.isSaving = false;
        // Continue with the confirmation step
        this.router.navigate(['/k1-import', this.sessionId, 'confirm']);
      },
      error: (err) => {
        this.isSaving = false;
        this.error =
          err?.error?.message || err?.message || 'Verification failed.';
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Cancels the import session through the data service. On success the user
 * is taken back to the import page; on failure the error message is shown
 * and the user stays on this page.
 */
public cancelImport(): void {
this.k1ImportDataService
.cancelImportSession(this.sessionId)
.pipe(takeUntilDestroyed(this.destroyRef))
.subscribe({
next: () => {
this.router.navigate(['/k1-import']);
},
error: (err) => {
// Prefer the server-provided message, then the error's own message, then a generic text
this.error =
err?.error?.message || err?.message || 'Cancel failed.';
this.changeDetectorRef.markForCheck();
}
});
}
/**
 * Loads the import session and populates the editable fields, the unmapped
 * items and the list of assignable box numbers.
 *
 * Only sessions in EXTRACTED or VERIFIED status can be verified; any other
 * status is reported as an error. For a VERIFIED session the previously
 * verified data is preferred over the raw extraction, so edits made earlier
 * are shown again when the user returns to this step; otherwise the raw
 * extraction is used. Box numbers are de-duplicated because they are used as
 * the tracking key of the "assign to box" options.
 */
private loadSession(): void {
  this.k1ImportDataService
    .fetchImportSession(this.sessionId)
    .pipe(takeUntilDestroyed(this.destroyRef))
    .subscribe({
      next: (session: any) => {
        if (
          session.status !== 'EXTRACTED' &&
          session.status !== 'VERIFIED'
        ) {
          this.error = `Session is in ${session.status} status. Cannot verify.`;
          this.isLoading = false;
          this.changeDetectorRef.markForCheck();
          return;
        }

        this.taxYear = session.taxYear;

        const extraction =
          session.status === 'VERIFIED'
            ? session.verifiedData || session.rawExtraction
            : session.rawExtraction || session.verifiedData;

        if (extraction) {
          this.fields = (extraction.fields || []).map(
            (f: K1ExtractedField) => ({
              ...f,
              isEditing: false,
              editValue: f.rawValue,
              editLabel: f.customLabel || f.label,
              // Fields without a cell type are treated as numbers
              cellType: (f as any).cellType || 'number',
              editCellType: (f as any).cellType || 'number'
            })
          );
          this.unmappedItems = (extraction.unmappedItems || []).map(
            (item: K1UnmappedItem) => ({
              ...item,
              resolution: item.resolution || null,
              assignedBoxNumber: item.assignedBoxNumber || null
            })
          );
          // Build available box numbers from fields (first occurrence wins)
          this.availableBoxNumbers = [
            ...new Set(this.fields.map((f) => f.boxNumber))
          ];
        }

        this.recalculateAggregations();
        this.checkConfirmability();
        this.isLoading = false;
        this.changeDetectorRef.markForCheck();
      },
      error: (err) => {
        this.error =
          err?.error?.message || err?.message || 'Failed to load session.';
        this.isLoading = false;
        this.changeDetectorRef.markForCheck();
      }
    });
}
/**
 * Rebuilds the aggregation summaries from the current field values
 * (FR-034: totals follow edits to cell values).
 *
 * The three rules are hard-coded on the client: ordinary income (box 1),
 * capital gains (boxes 8, 9a, 9b, 9c, 10) and deductions (boxes 12, 13).
 * A box without a numeric value counts as 0; when several fields share a box
 * number, the last one wins.
 */
private recalculateAggregations(): void {
  const valueByBox: Record<string, number> = {};

  for (const { boxNumber, numericValue } of this.fields) {
    if (numericValue !== null && numericValue !== undefined) {
      valueByBox[boxNumber] = numericValue;
    }
  }

  const valueOf = (box: string): number => valueByBox[box] ?? 0;
  const breakdownOf = (boxes: string[]): Record<string, number> =>
    Object.fromEntries(boxes.map((box) => [box, valueOf(box)]));

  const incomeCells = ['1'];
  const gainsCells = ['8', '9a', '9b', '9c', '10'];
  const deductionCells = ['12', '13'];

  let gainsTotal = 0;
  for (const box of gainsCells) {
    gainsTotal += valueOf(box);
  }

  // Client-side aggregation matching the default rules
  this.aggregations = [
    {
      ruleId: 'client-1',
      name: 'Total Ordinary Income',
      operation: 'SUM',
      sourceCells: incomeCells,
      computedValue: valueOf('1'),
      breakdown: breakdownOf(incomeCells)
    },
    {
      ruleId: 'client-2',
      name: 'Total Capital Gains',
      operation: 'SUM',
      sourceCells: gainsCells,
      computedValue: gainsTotal,
      breakdown: breakdownOf(gainsCells)
    },
    {
      ruleId: 'client-3',
      name: 'Total Deductions',
      operation: 'SUM',
      sourceCells: deductionCells,
      computedValue: valueOf('12') + valueOf('13'),
      breakdown: breakdownOf(deductionCells)
    }
  ];
}
/**
 * FR-035: the import can be confirmed only when no field with a confidence
 * level other than HIGH is still unreviewed and every unmapped item has been
 * either assigned or discarded.
 */
private checkConfirmability(): void {
  const hasUnreviewedField = this.fields.some(
    (field) => field.confidenceLevel !== 'HIGH' && !field.isReviewed
  );

  // `every` is true for an empty list, so no unmapped items means nothing to resolve
  const everyItemResolved = this.unmappedItems.every(
    ({ resolution }) => resolution === 'assigned' || resolution === 'discarded'
  );

  this.canConfirm = !hasUnreviewedField && everyItemResolved;
}
}

264
apps/client/src/app/pages/k1-import/k1-verification/k1-verification.html

@ -0,0 +1,264 @@
<div class="container">
<div class="row">
<div class="col">
<h1 class="d-none d-sm-block h3 mb-4 text-center">
Verify K-1 Extraction ({{ taxYear }})
</h1>
@if (error) {
<div class="alert alert-danger mb-3">
{{ error }}
</div>
}
@if (isLoading) {
<mat-progress-bar mode="indeterminate"></mat-progress-bar>
} @else {
<!-- Extracted Fields Table -->
<section class="fields-section mb-4">
<h2 class="h5 mb-3">Extracted Values</h2>
<div class="table-responsive">
<table mat-table [dataSource]="fields" class="w-100">
<!-- Box Number Column -->
<ng-container matColumnDef="boxNumber">
<th mat-header-cell *matHeaderCellDef>Box</th>
<td mat-cell *matCellDef="let field">
<strong>{{ field.boxNumber }}</strong>
</td>
</ng-container>
<!-- Label Column -->
<ng-container matColumnDef="label">
<th mat-header-cell *matHeaderCellDef>Label</th>
<td mat-cell *matCellDef="let field">
@if (field.isEditing) {
<mat-form-field class="compact-field">
<input matInput [(ngModel)]="field.editLabel" />
</mat-form-field>
} @else {
<span>{{ field.customLabel || field.label }}</span>
@if (field.customLabel) {
<small class="text-muted d-block">
(original: {{ field.label }})
</small>
}
}
</td>
</ng-container>
<!-- Raw Value Column -->
<ng-container matColumnDef="rawValue">
<th mat-header-cell *matHeaderCellDef>Value</th>
<td mat-cell *matCellDef="let field">
@if (field.isEditing) {
<mat-form-field class="compact-field">
<input matInput [(ngModel)]="field.editValue" />
</mat-form-field>
} @else {
<span [class.user-edited]="field.isUserEdited">
{{ field.rawValue }}
</span>
}
</td>
</ng-container>
<!-- Numeric Value Column -->
<ng-container matColumnDef="numericValue">
<th mat-header-cell *matHeaderCellDef>Parsed</th>
<td mat-cell *matCellDef="let field">
@if (field.numericValue !== null && field.numericValue !== undefined) {
{{ field.numericValue | number:'1.2-6' }}
} @else if (field.rawValue) {
{{ field.rawValue }}
} @else {
<span class="text-muted"></span>
}
</td>
</ng-container>
<!-- Cell Type Column -->
<ng-container matColumnDef="cellType">
<th mat-header-cell *matHeaderCellDef>Type</th>
<td mat-cell *matCellDef="let field">
@if (field.isEditing) {
<mat-select class="type-select" [(ngModel)]="field.editCellType">
@for (opt of cellTypeOptions; track opt.value) {
<mat-option [value]="opt.value">{{ opt.label }}</mat-option>
}
</mat-select>
} @else {
<span class="type-badge type-{{ field.cellType }}">{{ field.cellType }}</span>
}
</td>
</ng-container>
<!-- Confidence Column -->
<ng-container matColumnDef="confidence">
<th mat-header-cell *matHeaderCellDef>Confidence</th>
<td mat-cell *matCellDef="let field">
<span
class="confidence-badge"
[ngClass]="getConfidenceClass(field.confidenceLevel)"
[matTooltip]="(field.confidence * 100).toFixed(0) + '%'">
{{ field.confidenceLevel }}
</span>
</td>
</ng-container>
<!-- Reviewed Column -->
<ng-container matColumnDef="reviewed">
<th mat-header-cell *matHeaderCellDef>Reviewed</th>
<td mat-cell *matCellDef="let field">
<mat-checkbox
[checked]="field.isReviewed"
[disabled]="field.confidenceLevel === 'HIGH'"
(change)="toggleReviewed(field)">
</mat-checkbox>
</td>
</ng-container>
<!-- Actions Column -->
<ng-container matColumnDef="actions">
<th mat-header-cell *matHeaderCellDef></th>
<td mat-cell *matCellDef="let field">
@if (field.isEditing) {
<button mat-icon-button color="primary" (click)="saveEdit(field)"
matTooltip="Save">
<mat-icon>check</mat-icon>
</button>
<button mat-icon-button (click)="cancelEdit(field)"
matTooltip="Cancel">
<mat-icon>close</mat-icon>
</button>
} @else {
<button mat-icon-button (click)="startEditing(field)"
matTooltip="Edit">
<mat-icon>edit</mat-icon>
</button>
}
</td>
</ng-container>
<tr mat-header-row *matHeaderRowDef="displayedColumns"></tr>
<tr mat-row *matRowDef="let row; columns: displayedColumns"
[ngClass]="{
'row-low': row.confidenceLevel === 'LOW' && !row.isReviewed,
'row-medium': row.confidenceLevel === 'MEDIUM' && !row.isReviewed
}">
</tr>
</table>
</div>
</section>
<!-- Unmapped Items Section (FR-037, FR-038)
     Rendered only when there is at least one unmapped item. Each pending item
     (resolution === null) can be assigned to a box or discarded; resolved
     items show a badge describing the outcome instead of the controls. -->
@if (unmappedItems.length > 0) {
  <section class="unmapped-section mb-4">
    <h2 class="h5 mb-3">
      Unmapped Items
      <small class="text-muted">({{ unmappedItems.length }} items)</small>
    </h2>
    <div class="unmapped-list">
      <!-- Track by position: rawLabel is free text extracted from the PDF and
           is not guaranteed to be unique, so using it as the track key can
           produce duplicate keys. -->
      @for (item of unmappedItems; track $index) {
        <div class="unmapped-item p-3 mb-2" [ngClass]="{
          'resolved': item.resolution !== null
        }">
          <div class="d-flex justify-content-between align-items-start">
            <div>
              <strong>{{ item.rawLabel }}</strong>
              <span class="ms-2">{{ item.rawValue }}</span>
              @if (item.numericValue !== null) {
                <small class="text-muted ms-1">
                  ({{ item.numericValue | number:'1.2-6' }})
                </small>
              }
              <small class="text-muted d-block">Page {{ item.pageNumber }}</small>
            </div>
            <div class="unmapped-actions d-flex align-items-center gap-2">
              @if (item.resolution === null) {
                <mat-form-field class="compact-field">
                  <mat-label>Assign to box</mat-label>
                  <mat-select (selectionChange)="assignUnmappedItem(item, $event.value)">
                    @for (box of availableBoxNumbers; track box) {
                      <mat-option [value]="box">Box {{ box }}</mat-option>
                    }
                  </mat-select>
                </mat-form-field>
                <!-- Icon-only button: aria-label supplies the accessible name. -->
                <button mat-icon-button color="warn"
                  aria-label="Discard"
                  matTooltip="Discard"
                  (click)="discardUnmappedItem(item)">
                  <mat-icon>delete</mat-icon>
                </button>
              } @else {
                <span class="resolution-badge">
                  @if (item.resolution === 'assigned') {
                    Assigned to Box {{ item.assignedBoxNumber }}
                  } @else {
                    Discarded
                  }
                </span>
              }
            </div>
          </div>
        </div>
      }
    </div>
  </section>
}
<!-- Aggregation Summary (FR-033, FR-034)
     Rendered only when at least one aggregation exists. Each row shows the
     rule name, its operation, the comma-separated source boxes, and the
     computed value formatted with 2 to 6 fraction digits. -->
@if (aggregations.length > 0) {
<section class="aggregation-section mb-4">
<h2 class="h5 mb-3">Aggregation Summary</h2>
<div class="aggregation-list">
@for (agg of aggregations; track agg.ruleId) {
<div class="aggregation-row d-flex justify-content-between p-2">
<div>
<strong>{{ agg.name }}</strong>
<small class="text-muted ms-2">
({{ agg.operation }} of
<!-- Emits "Box N" for each source cell, with ", " after all but the last. -->
@for (box of agg.sourceCells; track box; let last = $last) {
Box {{ box }}@if (!last) {, }
})
</small>
</div>
<div class="aggregation-value">
<strong>{{ agg.computedValue | number:'1.2-6' }}</strong>
</div>
</div>
}
</div>
</section>
}
<!-- Review Status Banner (FR-035): warning shown whenever canConfirm is
     falsy, i.e. while the Confirm button below is disabled for review reasons. -->
@if (!canConfirm) {
<div class="alert alert-warning mb-3">
<strong>Review Required:</strong> Please review all medium/low-confidence
fields and resolve all unmapped items before submitting.
</div>
}
<!-- Action Buttons: Cancel calls cancelImport(). Confirm calls
     submitVerification() and is disabled while canConfirm is falsy or a save
     is in flight (isSaving); its label switches to "Saving..." during a save. -->
<div class="actions d-flex justify-content-between mt-4">
<button mat-stroked-button color="warn" (click)="cancelImport()">
Cancel Import
</button>
<button
mat-flat-button
color="primary"
[disabled]="!canConfirm || isSaving"
(click)="submitVerification()">
@if (isSaving) {
Saving...
} @else {
Confirm & Continue
}
</button>
</div>
}
</div>
</div>
</div>

199
apps/client/src/app/pages/k1-import/k1-verification/k1-verification.scss

@ -0,0 +1,199 @@
// Render the component's host element as a block-level box.
:host {
display: block;
}
// Column width hints: give Label the most room; keep Box and Actions compact.
// Box number: narrow fixed width, never wraps.
.mat-column-boxNumber {
width: 60px;
white-space: nowrap;
}
// Label: widest column (28%), never narrower than 200px.
.mat-column-label {
min-width: 200px;
width: 28%;
}
// Raw value: 18%, never narrower than 140px.
.mat-column-rawValue {
min-width: 140px;
width: 18%;
}
// Numeric value: no explicit width, but kept on one line.
.mat-column-numericValue {
white-space: nowrap;
}
// Cell type: 1% width plus nowrap; presumably a shrink-to-content hint for
// the table layout (confirm against the rendered table).
.mat-column-cellType {
width: 1%;
white-space: nowrap;
}
// Confidence badge column.
.mat-column-confidence {
width: 90px;
white-space: nowrap;
}
// Reviewed checkbox column.
.mat-column-reviewed {
width: 60px;
}
// Row action buttons column.
.mat-column-actions {
width: 80px;
white-space: nowrap;
}
// Extracted-fields table section.
.fields-section {
// Let the table scroll horizontally instead of overflowing its container.
.table-responsive {
overflow-x: auto;
}
// Compact inline form fields: fill the cell, but never below 120px.
.compact-field {
width: 100%;
min-width: 120px;
// NOTE(review): .mat-mdc-form-field-infix and input.mat-mdc-input-element
// look like Angular Material internal classes. Under emulated view
// encapsulation these selectors may not match; confirm the component's
// encapsulation setting or that the rules take effect.
.mat-mdc-form-field-infix {
padding: 4px 0;
}
input.mat-mdc-input-element {
font-size: 0.8125rem;
}
}
// Italic, primary-colored text; falls back to #1976d2 when --primary-color
// is not defined.
.user-edited {
font-style: italic;
color: var(--primary-color, #1976d2);
}
}
// Confidence badge styles
// Base pill: small, bold, upper-cased text.
.confidence-badge {
display: inline-block;
padding: 2px 8px;
border-radius: 4px;
font-size: 0.75rem;
font-weight: 600;
text-transform: uppercase;
}
// Color variants: green (high), amber (medium), red (low). Each pairs a
// 15%-opacity background tint with a darker text color of the same hue.
.confidence-high {
background-color: rgba(76, 175, 80, 0.15);
color: #2e7d32;
}
.confidence-medium {
background-color: rgba(255, 193, 7, 0.15);
color: #f57f17;
}
.confidence-low {
background-color: rgba(244, 67, 54, 0.15);
color: #c62828;
}
// Row highlighting for unreviewed medium/low-confidence rows.
// Uses the same red/amber hues as the confidence badges at 5% opacity.
// !important forces these tints over any other row background rule.
.row-low {
background-color: rgba(244, 67, 54, 0.05) !important;
}
.row-medium {
background-color: rgba(255, 193, 7, 0.05) !important;
}
// Unmapped items
.unmapped-section {
// Card for one unmapped item; border color falls back to #e0e0e0.
.unmapped-item {
border: 1px solid var(--border-color, #e0e0e0);
border-radius: 8px;
transition: background-color 0.2s ease;
// Resolved items are dimmed and tinted green.
&.resolved {
opacity: 0.7;
background-color: rgba(76, 175, 80, 0.05);
}
}
// Muted italic text describing how an item was resolved.
.resolution-badge {
font-size: 0.85rem;
color: var(--text-muted, #666);
font-style: italic;
}
// Fixed-width "Assign to box" form field.
.compact-field {
width: 160px;
// NOTE(review): input.mat-mdc-input-element looks like an Angular Material
// internal class; confirm this rule applies under the component's view
// encapsulation.
input.mat-mdc-input-element {
font-size: 0.8125rem;
}
}
}
// Aggregation summary
.aggregation-section {
// Rows are separated by a bottom border, omitted on the last row.
.aggregation-row {
border-bottom: 1px solid var(--border-color, #e0e0e0);
&:last-child {
border-bottom: none;
}
}
// Slightly larger text for the computed value.
.aggregation-value {
font-size: 1.1rem;
}
}
// Alerts
// Both variants share the same box shape (border, radius, padding) and
// differ only in hue: red for danger, amber/orange for warning.
.alert-danger {
background-color: rgba(244, 67, 54, 0.1);
border: 1px solid rgba(244, 67, 54, 0.3);
border-radius: 4px;
color: #f44336;
padding: 12px 16px;
}
.alert-warning {
background-color: rgba(255, 193, 7, 0.1);
border: 1px solid rgba(255, 193, 7, 0.3);
border-radius: 4px;
color: #e65100;
padding: 12px 16px;
}
// Extra space below the Cancel / Confirm button row.
.actions {
padding-bottom: 2rem;
}
// Type badge styling
// Base pill for the read-only cell-type label.
.type-badge {
display: inline-block;
font-size: 0.7rem;
font-weight: 500;
padding: 2px 8px;
border-radius: 10px;
text-transform: capitalize;
white-space: nowrap;
}
// Per-type color variants. The template builds the class name as
// "type-" + the field's cellType value.
.type-number {
background-color: #e3f2fd;
color: #1565c0;
}
.type-string {
background-color: #f3e5f5;
color: #7b1fa2;
}
.type-percentage {
background-color: #e8f5e9;
color: #2e7d32;
}
.type-boolean {
background-color: #fff3e0;
color: #e65100;
}
// Minimum width for the element carrying the .type-select class.
.type-select {
min-width: 110px;
}

4
apps/client/src/app/services/family-office-data.service.ts

@ -306,7 +306,7 @@ export class FamilyOfficeDataService {
type: string;
taxYear: number;
filingStatus?: string;
data: Record<string, number>;
data: Record<string, number | string | null>;
}): Observable<IKDocument> {
return this.http.post<IKDocument>('/api/v1/k-document', data);
}
@ -332,7 +332,7 @@ export class FamilyOfficeDataService {
public updateKDocument(
kDocumentId: string,
data: { filingStatus?: string; data?: Record<string, number> }
data: { filingStatus?: string; data?: Record<string, number | string | null> }
): Observable<IKDocument> {
return this.http.put<IKDocument>(`/api/v1/k-document/${kDocumentId}`, data);
}

212
apps/client/src/app/services/k1-import-data.service.ts

@ -0,0 +1,212 @@
import type {
K1ImportSessionSummary,
K1AggregationResult
} from '@ghostfolio/common/interfaces';
import type {
ConfirmK1ImportDto,
VerifyK1ImportDto
} from '@ghostfolio/common/dtos';
import { HttpClient, HttpParams } from '@angular/common/http';
import { Injectable } from '@angular/core';
import { Observable } from 'rxjs';
/**
 * Thin HTTP client for the K-1 import, cell-mapping and aggregation-rule
 * REST endpoints. Every method issues exactly one request through the
 * injected HttpClient and returns the resulting cold Observable unchanged.
 */
@Injectable({
  providedIn: 'root'
})
export class K1ImportDataService {
  public constructor(private http: HttpClient) {}

  // -- K1 import endpoints ---------------------------------------------

  /**
   * POST /api/v1/k1-import/upload
   * Sends the given form data (the K-1 PDF upload) to start an import.
   */
  public uploadK1(formData: FormData): Observable<any> {
    const endpoint = '/api/v1/k1-import/upload';

    return this.http.post(endpoint, formData);
  }

  /**
   * GET /api/v1/k1-import/:id
   * Reads the current state of one import session.
   */
  public fetchImportSession(sessionId: string): Observable<any> {
    const endpoint = `/api/v1/k1-import/${sessionId}`;

    return this.http.get(endpoint);
  }

  /**
   * PUT /api/v1/k1-import/:id/verify
   * Submits the user-verified extraction data for a session.
   */
  public verifyImportSession(
    sessionId: string,
    data: VerifyK1ImportDto
  ): Observable<any> {
    const endpoint = `/api/v1/k1-import/${sessionId}/verify`;

    return this.http.put(endpoint, data);
  }

  /**
   * POST /api/v1/k1-import/:id/confirm
   * Confirms verified data so the backend can create the model objects.
   */
  public confirmImportSession(
    sessionId: string,
    data: ConfirmK1ImportDto
  ): Observable<any> {
    const endpoint = `/api/v1/k1-import/${sessionId}/confirm`;

    return this.http.post(endpoint, data);
  }

  /**
   * POST /api/v1/k1-import/:id/cancel
   * Cancels an import session; the request body is an empty object.
   */
  public cancelImportSession(sessionId: string): Observable<any> {
    const endpoint = `/api/v1/k1-import/${sessionId}/cancel`;

    return this.http.post(endpoint, {});
  }

  /**
   * GET /api/v1/k1-import/history
   * Lists import sessions for a partnership. `taxYear` is added to the
   * query string only when it is truthy.
   */
  public fetchImportHistory(params: {
    partnershipId: string;
    taxYear?: number;
  }): Observable<K1ImportSessionSummary[]> {
    const baseQuery = new HttpParams().set(
      'partnershipId',
      params.partnershipId
    );
    const query = params.taxYear
      ? baseQuery.set('taxYear', params.taxYear.toString())
      : baseQuery;

    return this.http.get<K1ImportSessionSummary[]>(
      '/api/v1/k1-import/history',
      { params: query }
    );
  }

  /**
   * POST /api/v1/k1-import/:id/reprocess
   * Re-runs extraction for a previously uploaded PDF; empty request body.
   */
  public reprocessImportSession(sessionId: string): Observable<any> {
    const endpoint = `/api/v1/k1-import/${sessionId}/reprocess`;

    return this.http.post(endpoint, {});
  }

  // -- Cell mapping endpoints ------------------------------------------

  /**
   * GET /api/v1/cell-mapping
   * Reads cell mappings. `partnershipId` is sent as a query parameter only
   * when it is a non-empty string.
   */
  public fetchCellMappings(partnershipId?: string): Observable<any[]> {
    const query = partnershipId
      ? new HttpParams().set('partnershipId', partnershipId)
      : new HttpParams();

    return this.http.get<any[]>('/api/v1/cell-mapping', {
      params: query
    });
  }

  /**
   * PUT /api/v1/cell-mapping
   * Creates or updates the cell mappings of a partnership.
   */
  public updateCellMappings(data: {
    partnershipId: string;
    mappings: Array<{
      boxNumber: string;
      label: string;
      description?: string;
      cellType?: string;
      isCustom: boolean;
    }>;
  }): Observable<any[]> {
    const endpoint = '/api/v1/cell-mapping';

    return this.http.put<any[]>(endpoint, data);
  }

  /**
   * DELETE /api/v1/cell-mapping/reset
   * Resets a partnership's cell mappings to the IRS defaults.
   */
  public resetCellMappings(partnershipId: string): Observable<void> {
    const query = new HttpParams().set('partnershipId', partnershipId);

    return this.http.delete<void>('/api/v1/cell-mapping/reset', {
      params: query
    });
  }

  /**
   * PATCH /api/v1/cell-mapping/toggle-ignored
   * Toggles the isIgnored flag of one cell mapping.
   */
  public toggleFieldIgnored(data: {
    partnershipId: string;
    boxNumber: string;
  }): Observable<any> {
    const endpoint = '/api/v1/cell-mapping/toggle-ignored';

    return this.http.patch(endpoint, data);
  }

  // -- Aggregation rule endpoints --------------------------------------

  /**
   * GET /api/v1/cell-mapping/aggregation-rules
   * Reads aggregation rules. `partnershipId` is sent as a query parameter
   * only when it is a non-empty string.
   */
  public fetchAggregationRules(partnershipId?: string): Observable<any[]> {
    const query = partnershipId
      ? new HttpParams().set('partnershipId', partnershipId)
      : new HttpParams();

    return this.http.get<any[]>('/api/v1/cell-mapping/aggregation-rules', {
      params: query
    });
  }

  /**
   * PUT /api/v1/cell-mapping/aggregation-rules
   * Creates or updates the aggregation rules of a partnership.
   */
  public updateAggregationRules(data: {
    partnershipId: string;
    rules: Array<{
      name: string;
      operation: string;
      sourceCells: string[];
    }>;
  }): Observable<any[]> {
    const endpoint = '/api/v1/cell-mapping/aggregation-rules';

    return this.http.put<any[]>(endpoint, data);
  }

  /**
   * GET /api/v1/cell-mapping/aggregation-rules/compute
   * Computes aggregation values for one KDocument. `kDocumentId` is always
   * sent; `partnershipId` only when it is a non-empty string.
   */
  public computeAggregations(params: {
    kDocumentId: string;
    partnershipId?: string;
  }): Observable<K1AggregationResult[]> {
    const baseQuery = new HttpParams().set('kDocumentId', params.kDocumentId);
    const query = params.partnershipId
      ? baseQuery.set('partnershipId', params.partnershipId)
      : baseQuery;

    return this.http.get<K1AggregationResult[]>(
      '/api/v1/cell-mapping/aggregation-rules/compute',
      { params: query }
    );
  }
}

274
k1-positions-dump.txt

@ -0,0 +1,274 @@
Pages: 1
=== PAGE 1 ===
DATA | x= 524.2 | y= 758.7 | font=monospace | "651123"
TMPL | x= 511 | y= 748.4 | font=serif | "OMB No. 1545-0123"
DATA | x= 324.3 | y= 746.2 | font=sans-serif | "X"
TMPL | x= 336 | y= 746 | font=serif | "Final K-1"
TMPL | x= 415.2 | y= 746 | font=serif | "Amended K-1"
TMPL | x= 36 | y= 735.8 | font=serif | "Schedule K-1"
TMPL | x= 319.1 | y= 734.9 | font=serif | "Part III"
TMPL | x= 360 | y= 735.4 | font=serif | "PartnerΓÇÖs Share of Current Year Income,"
DATA | x= 236.8 | y= 727.7 | font=sans-serif | "20"
DATA | x= 262.1 | y= 727.7 | font=sans-serif | "25"
TMPL | x= 36 | y= 723.8 | font=serif | "(Form 1065)"
TMPL | x= 360 | y= 723.4 | font=serif | "Deductions, Credits, and Other Items"
TMPL | x= 36 | y= 713.5 | font=serif | "Department of the Treasury"
TMPL | x= 318.5 | y= 712 | font=serif | "1"
TMPL | x= 334.2 | y= 712 | font=serif | "Ordinary business income (loss)"
TMPL | x= 453.3 | y= 712 | font=serif | "14"
TMPL | x= 471 | y= 712 | font=serif | "Self-employment earnings (loss)"
TMPL | x= 36 | y= 705.5 | font=serif | "Internal Revenue Service"
TMPL | x= 193.2 | y= 703 | font=serif | "For calendar year 2025, or tax year"
TMPL | x= 71 | y= 686 | font=serif | "beginning"
TMPL | x= 129.6 | y= 687 | font=serif | "/"
DATA | x= 151.2 | y= 686.8 | font=sans-serif | "/"
DATA | x= 159 | y= 686.8 | font=sans-serif | "2025"
TMPL | x= 195.6 | y= 686 | font=serif | "ending"
TMPL | x= 244.8 | y= 687 | font=serif | "/"
TMPL | x= 266.4 | y= 687 | font=serif | "/"
TMPL | x= 318.5 | y= 688 | font=serif | "2"
TMPL | x= 333.2 | y= 688 | font=serif | "Net rental real estate income (loss)"
TMPL | x= 36 | y= 669.6 | font=serif | "PartnerΓÇÖs Share of Income, Deductions,"
TMPL | x= 318.5 | y= 664 | font=serif | "3"
TMPL | x= 334.2 | y= 664.1 | font=serif | "Other net rental income (loss)"
TMPL | x= 453.3 | y= 664 | font=serif | "15"
TMPL | x= 471 | y= 664 | font=serif | "Credits"
TMPL | x= 36 | y= 656.6 | font=serif | "Credits, etc."
TMPL | x= 215.2 | y= 656.8 | font=serif | "See separate instructions."
TMPL | x= 48.4 | y= 638.9 | font=serif | "Part I"
TMPL | x= 86.4 | y= 638.9 | font=serif | "Information About the Partnership"
TMPL | x= 316.4 | y= 640 | font=serif | "4a"
TMPL | x= 334.2 | y= 640 | font=serif | "Guaranteed payments for services"
TMPL | x= 40.8 | y= 626 | font=serif | "A"
TMPL | x= 316.3 | y= 616 | font=serif | "4b"
TMPL | x= 334.2 | y= 616 | font=serif | "Guaranteed payments for capital"
TMPL | x= 453.3 | y= 616 | font=serif | "16"
TMPL | x= 472 | y= 616 | font=serif | "Schedule K-3 is attached if"
TMPL | x= 472 | y= 606 | font=serif | "checked"
TMPL | x= 504 | y= 606 | font=serif | "."
TMPL | x= 516 | y= 606 | font=serif | "."
TMPL | x= 528 | y= 606 | font=serif | "."
TMPL | x= 540 | y= 606 | font=serif | "."
TMPL | x= 552 | y= 606 | font=serif | "."
TMPL | x= 40.7 | y= 602 | font=serif | "B"
DATA | x= 563.3 | y= 603.8 | font=sans-serif | "X"
TMPL | x= 316.4 | y= 592 | font=serif | "4c"
TMPL | x= 334.2 | y= 592 | font=serif | "Total guaranteed payments"
TMPL | x= 453.3 | y= 592 | font=serif | "17"
TMPL | x= 471 | y= 592 | font=serif | "Alternative minimum tax (AMT) items"
TMPL | x= 318.5 | y= 568 | font=serif | "5"
TMPL | x= 334.2 | y= 568 | font=serif | "Interest income"
TMPL | x= 40.6 | y= 554.5 | font=serif | "C"
TMPL | x= 58.4 | y= 554.5 | font=serif | "IRS center where partnership filed return:"
DATA | x= 185.4 | y= 553.7 | font=sans-serif | "E-FILE"
TMPL | x= 40.6 | y= 543 | font=serif | "D"
TMPL | x= 72 | y= 543 | font=serif | "Check if this is a publicly traded partnership (PTP)"
TMPL | x= 316.4 | y= 544 | font=serif | "6a"
TMPL | x= 334.2 | y= 544 | font=serif | "Ordinary dividends"
TMPL | x= 46.9 | y= 530.9 | font=serif | "Part II"
TMPL | x= 86.4 | y= 530.9 | font=serif | "Information About the Partner"
TMPL | x= 40.9 | y= 518 | font=serif | "E"
TMPL | x= 316.3 | y= 520 | font=serif | "6b"
TMPL | x= 334.2 | y= 520 | font=serif | "Qualified dividends"
TMPL | x= 453.3 | y= 520 | font=serif | "18"
TMPL | x= 471 | y= 520 | font=serif | "Tax-exempt income and"
TMPL | x= 471 | y= 511.6 | font=serif | "nondeductible expenses"
TMPL | x= 41.1 | y= 494 | font=serif | "F"
TMPL | x= 316.4 | y= 496 | font=serif | "6c"
TMPL | x= 334.2 | y= 496 | font=serif | "Dividend equivalents"
TMPL | x= 318.5 | y= 472 | font=serif | "7"
TMPL | x= 334.2 | y= 472 | font=serif | "Royalties"
TMPL | x= 40.5 | y= 447 | font=serif | "G"
TMPL | x= 72 | y= 446.6 | font=serif | "General partner or LLC"
DATA | x= 180.3 | y= 446.6 | font=sans-serif | "X"
TMPL | x= 194.4 | y= 446.6 | font=serif | "Limited partner or other LLC"
TMPL | x= 318.5 | y= 448 | font=serif | "8"
TMPL | x= 334.2 | y= 448 | font=serif | "Net short-term capital gain (loss)"
TMPL | x= 72 | y= 438.2 | font=serif | "member-manager"
TMPL | x= 194.4 | y= 438.2 | font=serif | "member"
TMPL | x= 453.3 | y= 436 | font=serif | "19"
TMPL | x= 471 | y= 436 | font=serif | "Distributions"
TMPL | x= 38.7 | y= 423 | font=serif | "H1"
DATA | x= 58 | y= 422.9 | font=sans-serif | "X"
TMPL | x= 72 | y= 422 | font=serif | "Domestic partner"
TMPL | x= 194.4 | y= 422 | font=serif | "Foreign partner"
TMPL | x= 316.4 | y= 424 | font=serif | "9a"
TMPL | x= 334.2 | y= 424 | font=serif | "Net long-term capital gain (loss)"
DATA | x= 455.2 | y= 423.2 | font=sans-serif | "A"
DATA | x= 530.6 | y= 422 | font=sans-serif | "4,493,757"
TMPL | x= 38.7 | y= 411 | font=serif | "H2"
DATA | x= 57.9 | y= 410.5 | font=sans-serif | "X"
TMPL | x= 72 | y= 410 | font=serif | "If the partner is a disregarded entity (DE), enter the partnerΓÇÖs:"
TMPL | x= 57.6 | y= 398.1 | font=serif | "TIN"
TMPL | x= 144 | y= 398.1 | font=serif | "Name"
TMPL | x= 316.3 | y= 400 | font=serif | "9b"
TMPL | x= 334.2 | y= 400 | font=serif | "Collectibles (28%) gain (loss)"
TMPL | x= 40.2 | y= 386 | font=serif | "I1"
TMPL | x= 57.6 | y= 387 | font=serif | "What type of entity is this partner?"
TMPL | x= 453.3 | y= 388 | font=serif | "20"
TMPL | x= 471 | y= 388 | font=serif | "Other information"
TMPL | x= 40.2 | y= 374 | font=serif | "I2"
TMPL | x= 57.6 | y= 374 | font=serif | "If this partner is a retirement plan (IRA/SEP/Keogh/etc.), check here"
TMPL | x= 276 | y= 374 | font=serif | "."
TMPL | x= 316.4 | y= 376 | font=serif | "9c"
TMPL | x= 334.2 | y= 376 | font=serif | "Unrecaptured section 1250 gain"
TMPL | x= 41.3 | y= 362 | font=serif | "J"
TMPL | x= 57.6 | y= 362 | font=serif | "PartnerΓÇÖs share of profit, loss, and capital (see instructions):"
DATA | x= 455.2 | y= 362.8 | font=sans-serif | "A"
DATA | x= 525.6 | y= 362.8 | font=sans-serif | "SEE STMT"
TMPL | x= 110 | y= 352.5 | font=serif | "Beginning"
TMPL | x= 229.8 | y= 352.5 | font=serif | "Ending"
TMPL | x= 316.5 | y= 352 | font=serif | "10"
TMPL | x= 334.2 | y= 352 | font=serif | "Net section 1231 gain (loss)"
TMPL | x= 57.6 | y= 338 | font=serif | "Profit"
DATA | x= 139.1 | y= 339.1 | font=sans-serif | "3.032900"
TMPL | x= 183.7 | y= 338 | font=serif | "%"
DATA | x= 250.1 | y= 339.1 | font=sans-serif | "0.000000"
TMPL | x= 291.7 | y= 338 | font=serif | "%"
DATA | x= 455.2 | y= 338.5 | font=sans-serif | "B"
DATA | x= 525.6 | y= 339 | font=sans-serif | "SEE STMT"
TMPL | x= 57.6 | y= 326 | font=serif | "Loss"
DATA | x= 139.1 | y= 326.1 | font=sans-serif | "3.032900"
TMPL | x= 183.7 | y= 326 | font=serif | "%"
DATA | x= 250.1 | y= 326.1 | font=sans-serif | "0.000000"
TMPL | x= 291.7 | y= 326 | font=serif | "%"
TMPL | x= 316.5 | y= 328 | font=serif | "11"
TMPL | x= 334.2 | y= 328 | font=serif | "Other income (loss)"
TMPL | x= 57.6 | y= 314.5 | font=serif | "Capital"
DATA | x= 139.1 | y= 314.2 | font=sans-serif | "3.032900"
TMPL | x= 183.7 | y= 314 | font=serif | "%"
DATA | x= 250.1 | y= 314.2 | font=sans-serif | "0.000000"
TMPL | x= 291.7 | y= 314 | font=serif | "%"
DATA | x= 314.2 | y= 314.4 | font=sans-serif | "ZZ*"
DATA | x= 403.9 | y= 314.4 | font=sans-serif | "(409,615)"
DATA | x= 455.2 | y= 315.6 | font=sans-serif | "V"
DATA | x= 525.6 | y= 314.6 | font=sans-serif | "SEE STMT"
TMPL | x= 57.6 | y= 302 | font=serif | "Check if decrease is due to:"
TMPL | x= 72 | y= 290 | font=serif | "Sale"
TMPL | x= 89.9 | y= 290 | font=serif | "or"
TMPL | x= 115.2 | y= 290 | font=serif | "Exchange of partnership interest. See instructions."
DATA | x= 456.4 | y= 291.3 | font=sans-serif | "*"
DATA | x= 525.6 | y= 290.3 | font=sans-serif | "SEE STMT"
TMPL | x= 38.7 | y= 278 | font=serif | "K1"
TMPL | x= 57.6 | y= 278 | font=serif | "PartnerΓÇÖs share of liabilities:"
TMPL | x= 316.5 | y= 280 | font=serif | "12"
TMPL | x= 334.2 | y= 280 | font=serif | "Section 179 deduction"
TMPL | x= 453.3 | y= 280 | font=serif | "21"
TMPL | x= 471 | y= 280 | font=serif | "Foreign taxes paid or accrued"
TMPL | x= 160.6 | y= 268.5 | font=serif | "Beginning"
TMPL | x= 251.2 | y= 268.5 | font=serif | "Ending"
DATA | x= 456.4 | y= 267.1 | font=sans-serif | "*"
DATA | x= 555.6 | y= 266.1 | font=sans-serif | "196"
TMPL | x= 57.6 | y= 254 | font=serif | "Nonrecourse"
TMPL | x= 108 | y= 254 | font=serif | "."
TMPL | x= 120 | y= 254 | font=serif | "."
TMPL | x= 134.9 | y= 254 | font=serif | "$"
DATA | x= 180.8 | y= 254.5 | font=sans-serif | "498,211"
TMPL | x= 221.3 | y= 254 | font=serif | "$"
TMPL | x= 316.5 | y= 256 | font=serif | "13"
TMPL | x= 334.2 | y= 256 | font=serif | "Other deductions"
TMPL | x= 57.6 | y= 238.4 | font=serif | "Qualified nonrecourse"
TMPL | x= 57.6 | y= 230 | font=serif | "financing"
TMPL | x= 96 | y= 230 | font=serif | "."
TMPL | x= 108 | y= 230 | font=serif | "."
TMPL | x= 120 | y= 230 | font=serif | "."
TMPL | x= 134.9 | y= 230 | font=serif | "$"
TMPL | x= 221.3 | y= 230 | font=serif | "$"
TMPL | x= 57.6 | y= 218.5 | font=serif | "Recourse"
TMPL | x= 96 | y= 218.5 | font=serif | "."
TMPL | x= 108 | y= 218.5 | font=serif | "."
TMPL | x= 120 | y= 218.5 | font=serif | "."
TMPL | x= 134.9 | y= 218.5 | font=serif | "$"
TMPL | x= 221.3 | y= 218.5 | font=serif | "$"
TMPL | x= 38.7 | y= 207 | font=serif | "K2"
TMPL | x= 57.6 | y= 207 | font=serif | "Check this box if item K1 includes liability amounts from lower-tier partnerships"
DATA | x= 294.9 | y= 205.8 | font=sans-serif | "X"
TMPL | x= 38.7 | y= 195 | font=serif | "K3"
TMPL | x= 57.6 | y= 195 | font=serif | "Check if any of the above liability is subject to guarantees or other"
TMPL | x= 57.6 | y= 186 | font=serif | "payment obligations by the partner. See instructions"
TMPL | x= 228 | y= 186 | font=serif | "."
TMPL | x= 240 | y= 186 | font=serif | "."
TMPL | x= 252 | y= 186 | font=serif | "."
TMPL | x= 264 | y= 186 | font=serif | "."
TMPL | x= 276 | y= 186 | font=serif | "."
TMPL | x= 316.2 | y= 183.1 | font=serif | "22"
TMPL | x= 345.6 | y= 183 | font=serif | "More than one activity for at-risk purposes*"
TMPL | x= 41.1 | y= 170 | font=serif | "L"
TMPL | x= 122.3 | y= 170.5 | font=serif | "PartnerΓÇÖs Capital Account Analysis"
TMPL | x= 316.2 | y= 171.1 | font=serif | "23"
TMPL | x= 345.6 | y= 171 | font=serif | "More than one activity for passive activity purposes*"
TMPL | x= 57.6 | y= 158 | font=serif | "Beginning capital account"
TMPL | x= 156 | y= 158 | font=serif | "."
TMPL | x= 168 | y= 158 | font=serif | "."
TMPL | x= 180 | y= 158 | font=serif | "."
TMPL | x= 189.5 | y= 157.6 | font=serif | "$"
DATA | x= 257.8 | y= 157.4 | font=sans-serif | "4,903,568"
TMPL | x= 316.6 | y= 158.6 | font=serif | "*See attached statement for additional information."
TMPL | x= 57.6 | y= 146 | font=serif | "Capital contributed during the year"
TMPL | x= 168 | y= 146 | font=serif | "."
TMPL | x= 180 | y= 146 | font=serif | "."
TMPL | x= 189.5 | y= 145.6 | font=serif | "$"
TMPL | x= 57.6 | y= 134 | font=serif | "Current year net income (loss)"
TMPL | x= 156 | y= 134 | font=serif | "."
TMPL | x= 168 | y= 134 | font=serif | "."
TMPL | x= 180 | y= 134 | font=serif | "."
TMPL | x= 189.5 | y= 133.6 | font=serif | "$"
DATA | x= 259.3 | y= 133.7 | font=sans-serif | "(409,811)"
TMPL | x= 57.6 | y= 122 | font=serif | "Other increase (decrease) (attach explanation)"
TMPL | x= 189.5 | y= 121.6 | font=serif | "$"
TMPL | x= 57.6 | y= 110 | font=serif | "Withdrawals and distributions"
TMPL | x= 156 | y= 110 | font=serif | "."
TMPL | x= 168 | y= 110 | font=serif | "."
TMPL | x= 180 | y= 110 | font=serif | "."
TMPL | x= 189.5 | y= 109.6 | font=serif | "$"
TMPL | x= 195.4 | y= 110.5 | font=serif | "("
DATA | x= 257.8 | y= 109.4 | font=sans-serif | "4,493,757"
TMPL | x= 300.4 | y= 110.5 | font=serif | ")"
TMPL | x= 57.6 | y= 99 | font=serif | "Ending capital account"
TMPL | x= 144 | y= 99 | font=serif | "."
TMPL | x= 156 | y= 99 | font=serif | "."
TMPL | x= 168 | y= 99 | font=serif | "."
TMPL | x= 180 | y= 99 | font=serif | "."
TMPL | x= 189.5 | y= 97.6 | font=serif | "$"
TMPL | x= 40 | y= 86 | font=serif | "M"
TMPL | x= 58.4 | y= 86 | font=serif | "Did the partner contribute property with a built-in gain (loss)?"
TMPL | x= 72 | y= 74 | font=serif | "Yes"
DATA | x= 101.2 | y= 74.2 | font=sans-serif | "X"
TMPL | x= 115.2 | y= 74 | font=serif | "No"
TMPL | x= 136.8 | y= 74 | font=serif | "If ΓÇ£Yes,ΓÇ¥ attach statement. See instructions."
TMPL | x= 40.6 | y= 62 | font=serif | "N"
TMPL | x= 70.1 | y= 62 | font=serif | "PartnerΓÇÖs Share of Net Unrecognized Section 704(c) Gain or (Loss)"
TMPL | x= 323 | y= 61.3 | font=serif | "For IRS Use Only"
TMPL | x= 57.6 | y= 51 | font=serif | "Beginning"
TMPL | x= 96 | y= 51 | font=serif | "."
TMPL | x= 108 | y= 51 | font=serif | "."
TMPL | x= 120 | y= 51 | font=serif | "."
TMPL | x= 132 | y= 51 | font=serif | "."
TMPL | x= 144 | y= 51 | font=serif | "."
TMPL | x= 156 | y= 51 | font=serif | "."
TMPL | x= 168 | y= 51 | font=serif | "."
TMPL | x= 180 | y= 51 | font=serif | "."
TMPL | x= 189.1 | y= 51 | font=serif | "$"
DATA | x= 271.5 | y= 49.7 | font=sans-serif | "(5,373)"
TMPL | x= 57.6 | y= 39 | font=serif | "Ending"
TMPL | x= 84 | y= 39 | font=serif | "."
TMPL | x= 96 | y= 39 | font=serif | "."
TMPL | x= 108 | y= 39 | font=serif | "."
TMPL | x= 120 | y= 39 | font=serif | "."
TMPL | x= 132 | y= 39 | font=serif | "."
TMPL | x= 144 | y= 39 | font=serif | "."
TMPL | x= 156 | y= 39 | font=serif | "."
TMPL | x= 168 | y= 39 | font=serif | "."
TMPL | x= 180 | y= 39 | font=serif | "."
TMPL | x= 189.1 | y= 39 | font=serif | "$"
TMPL | x= 36 | y= 26 | font=serif | "For Paperwork Reduction Act Notice, see the Instructions for Form 1065."
TMPL | x= 283.9 | y= 26 | font=serif | "www.irs.gov/Form1065"
TMPL | x= 362.7 | y= 26 | font=serif | "Cat. No. 11394R"
TMPL | x= 419.6 | y= 26 | font=serif | "Schedule K-1 (Form 1065) 2025"
TMPL | x= 524.9 | y= 26 | font=serif | "Created 2/26/25"
DATA | x= 285.6 | y= 5.5 | font=sans-serif | "Page 2 of 31"
DATA | x= 92.1 | y= 2.8 | font=sans-serif | "(409,811)"
Done.

14
libs/common/src/lib/dtos/index.ts

@ -14,6 +14,13 @@ import { CreateTagDto } from './create-tag.dto';
import { CreateWatchlistItemDto } from './create-watchlist-item.dto';
import { DeleteOwnUserDto } from './delete-own-user.dto';
import { CreateKDocumentDto, UpdateKDocumentDto } from './k-document.dto';
import {
ConfirmK1ImportDto,
CreateK1ImportDto,
K1ExtractedFieldDto,
K1UnmappedItemDto,
VerifyK1ImportDto
} from './k1-import.dto';
import {
CreatePartnershipAssetDto,
CreatePartnershipDto,
@ -37,6 +44,7 @@ import { UpdateUserSettingDto } from './update-user-setting.dto';
export {
AuthDeviceDto,
ConfirmK1ImportDto,
CreateAccessDto,
CreateAccountBalanceDto,
CreateAccountDto,
@ -45,6 +53,7 @@ export {
CreateAssetProfileWithMarketDataDto,
CreateDistributionDto,
CreateEntityDto,
CreateK1ImportDto,
CreateKDocumentDto,
CreateOrderDto,
CreateOwnershipDto,
@ -56,6 +65,8 @@ export {
CreateTagDto,
CreateWatchlistItemDto,
DeleteOwnUserDto,
K1ExtractedFieldDto,
K1UnmappedItemDto,
TransferBalanceDto,
UpdateAccessDto,
UpdateAccountDto,
@ -70,5 +81,6 @@ export {
UpdatePlatformDto,
UpdatePropertyDto,
UpdateTagDto,
UpdateUserSettingDto
UpdateUserSettingDto,
VerifyK1ImportDto
};

132
libs/common/src/lib/dtos/k1-import.dto.ts

@ -0,0 +1,132 @@
import { KDocumentStatus } from '@prisma/client';
import {
IsArray,
IsBoolean,
IsEnum,
IsInt,
IsNumber,
IsOptional,
IsString,
Min,
ValidateNested
} from 'class-validator';
import { Type } from 'class-transformer';
/**
 * Request payload identifying the partnership and tax year of a new K-1
 * import. Presumably sent alongside the uploaded PDF; confirm against the
 * upload controller.
 */
export class CreateK1ImportDto {
// Partnership the import belongs to; validated as a string only.
@IsString()
partnershipId: string;
// Must be an integer of at least 1900.
@IsInt()
@Min(1900)
taxYear: number;
}
/**
 * Allowed values for `K1ExtractedFieldDto.confidenceLevel`.
 * Kept as a plain object so it can be passed to `@IsEnum`.
 */
const K1_CONFIDENCE_LEVELS = {
  HIGH: 'HIGH',
  MEDIUM: 'MEDIUM',
  LOW: 'LOW'
} as const;

/**
 * One extracted K-1 box value as submitted by the client during
 * verification. Validation is performed by the class-validator decorators
 * on each property.
 */
export class K1ExtractedFieldDto {
  /** Box identifier the value belongs to. */
  @IsString()
  boxNumber: string;

  /** Display label of the box. */
  @IsString()
  label: string;

  /** Optional user-supplied label override. */
  @IsOptional()
  @IsString()
  customLabel?: string;

  /** Raw extracted text. */
  @IsString()
  rawValue: string;

  /** Parsed numeric value, when one is available. */
  @IsOptional()
  @IsNumber()
  numericValue?: number;

  /** Numeric confidence score. */
  @IsNumber()
  confidence: number;

  /**
   * Confidence bucket. Validated against the three allowed values rather
   * than as an arbitrary string, so the runtime check matches the declared
   * union type.
   */
  @IsEnum(K1_CONFIDENCE_LEVELS)
  confidenceLevel: 'HIGH' | 'MEDIUM' | 'LOW';

  /** Whether the user manually edited the value. */
  @IsBoolean()
  isUserEdited: boolean;

  /** Whether the user has reviewed this field. */
  @IsBoolean()
  isReviewed: boolean;

  /** Optional cell type of the box. */
  @IsOptional()
  @IsString()
  cellType?: string;

  /** Optional subtype code; `@IsOptional` lets null and undefined through. */
  @IsOptional()
  @IsString()
  subtype?: string | null;

  /** Optional category of the field. */
  @IsOptional()
  @IsString()
  fieldCategory?: string;

  /** Optional flag marking the field as a checkbox value. */
  @IsOptional()
  @IsBoolean()
  isCheckbox?: boolean;
}
/**
 * Allowed values for `K1UnmappedItemDto.resolution`.
 * Kept as a plain object so it can be passed to `@IsEnum`.
 */
const K1_UNMAPPED_RESOLUTIONS = {
  assigned: 'assigned',
  discarded: 'discarded'
} as const;

/**
 * An extracted value that matched no configured cell mapping, together with
 * the user's decision about it, as submitted during verification.
 */
export class K1UnmappedItemDto {
  /** Raw label text extracted from the PDF. */
  @IsString()
  rawLabel: string;

  /** Raw value text extracted from the PDF. */
  @IsString()
  rawValue: string;

  /** Parsed numeric value, when one is available. */
  @IsOptional()
  @IsNumber()
  numericValue?: number;

  /** Numeric confidence score. */
  @IsNumber()
  confidence: number;

  /** Page the item was extracted from. */
  @IsInt()
  pageNumber: number;

  /**
   * User decision for this item. Validated against the two allowed values
   * rather than as an arbitrary string, so the runtime check matches the
   * declared union type.
   */
  @IsEnum(K1_UNMAPPED_RESOLUTIONS)
  resolution: 'assigned' | 'discarded';

  /** Target box number; presumably set when resolution is 'assigned'. */
  @IsOptional()
  @IsString()
  assignedBoxNumber?: string;

  /** Optional X position of the item. */
  @IsOptional()
  @IsNumber()
  x?: number;

  /** Optional Y position of the item. */
  @IsOptional()
  @IsNumber()
  y?: number;

  /** Optional font identifier of the item. */
  @IsOptional()
  @IsString()
  fontName?: string;
}
/**
 * Payload for submitting user-verified extraction data: the tax year, the
 * list of extracted fields, and optionally the unmapped items.
 */
export class VerifyK1ImportDto {
// Must be an integer of at least 1900.
@IsInt()
@Min(1900)
taxYear: number;
// Each element is transformed to K1ExtractedFieldDto and validated with
// that class's own decorators.
@IsArray()
@ValidateNested({ each: true })
@Type(() => K1ExtractedFieldDto)
fields: K1ExtractedFieldDto[];
// Optional; when present, each element is transformed to K1UnmappedItemDto
// and validated the same way.
@IsOptional()
@IsArray()
@ValidateNested({ each: true })
@Type(() => K1UnmappedItemDto)
unmappedItems?: K1UnmappedItemDto[];
}
/**
 * Allowed values for `ConfirmK1ImportDto.existingKDocumentAction`.
 * Kept as a plain object so it can be passed to `@IsEnum`.
 */
const K1_EXISTING_DOCUMENT_ACTIONS = {
  UPDATE: 'UPDATE',
  CREATE_NEW: 'CREATE_NEW'
} as const;

/**
 * Payload for confirming a verified import session.
 */
export class ConfirmK1ImportDto {
  /** Filing status to apply; must be a member of the Prisma KDocumentStatus enum. */
  @IsEnum(KDocumentStatus)
  filingStatus: KDocumentStatus;

  /**
   * Optional choice between updating an existing KDocument and creating a
   * new one. When present it must be one of the two allowed values; a bare
   * string check would accept any text.
   */
  @IsOptional()
  @IsEnum(K1_EXISTING_DOCUMENT_ACTIONS)
  existingKDocumentAction?: 'UPDATE' | 'CREATE_NEW';
}

14
libs/common/src/lib/interfaces/index.ts

@ -54,6 +54,14 @@ import type {
IKDocumentAllocation,
K1Data
} from './k-document.interface';
import type {
K1AggregationResult,
K1ConfirmationRequest,
K1ExtractionResult,
K1ExtractedField,
K1ImportSessionSummary,
K1UnmappedItem
} from './k1-import.interface';
import type { LineChartItem } from './line-chart-item.interface';
import type { LookupItem } from './lookup-item.interface';
import type { MarketData } from './market-data.interface';
@ -192,6 +200,12 @@ export {
IKDocument,
IKDocumentAllocation,
IOwnership,
K1AggregationResult,
K1ConfirmationRequest,
K1ExtractionResult,
K1ExtractedField,
K1ImportSessionSummary,
K1UnmappedItem,
IPartnership,
IPartnershipAsset,
IPartnershipDetail,

134
libs/common/src/lib/interfaces/k1-import.interface.ts

@ -0,0 +1,134 @@
/** Complete output of one K-1 extraction run. */
export interface K1ExtractionResult {
/** Extracted metadata from the K-1 header */
metadata: {
partnershipName: string | null;
partnershipEin: string | null;
partnerName: string | null;
partnerEin: string | null;
taxYear: number | null;
isAmended: boolean;
isFinal: boolean;
};
/** Extracted box values — mapped to known cells */
fields: K1ExtractedField[];
/** Extracted values that didn't match any configured cell mapping */
unmappedItems: K1UnmappedItem[];
/** Overall extraction confidence (0.0–1.0) */
overallConfidence: number;
/** Extraction method used */
method: 'pdf-parse' | 'azure' | 'tesseract';
/** Number of pages processed */
pagesProcessed: number;
}
/** One extracted K-1 value that was matched to a known box. */
export interface K1ExtractedField {
/** Box identifier (e.g., "1", "6a", "19a") */
boxNumber: string;
/** Display label from cell mapping */
label: string;
/** Custom label override by user (null if not overridden) */
customLabel: string | null;
/** Extracted raw text value */
rawValue: string;
/** Parsed numeric value (null if unparseable) */
numericValue: number | null;
/** Confidence score (0.0–1.0) */
confidence: number;
/** Confidence level for display */
confidenceLevel: 'HIGH' | 'MEDIUM' | 'LOW';
/** Whether user has manually edited this value */
isUserEdited: boolean;
/** Whether user has explicitly reviewed this field (required for medium/low confidence) */
isReviewed: boolean;
/** Subtype code for boxes that support them (e.g., "ZZ*", "A", "B", "*"). Null for simple boxes. */
subtype?: string | null;
/** Field category: PART_III, METADATA, SECTION_J, SECTION_K, SECTION_L, SECTION_M, SECTION_N, CHECKBOX */
fieldCategory?: string;
/** Whether this field is a boolean checkbox value */
isCheckbox?: boolean;
}
/** An extracted value that matched no configured cell mapping, plus the user's decision about it. */
export interface K1UnmappedItem {
/** Raw text label extracted from the PDF */
rawLabel: string;
/** Raw text value extracted */
rawValue: string;
/** Parsed numeric value (null if unparseable) */
numericValue: number | null;
/** Confidence score (0.0–1.0) */
confidence: number;
/** Page number where this was extracted */
pageNumber: number;
/** User action: 'assigned' (to a cell), 'discarded', or null (pending) */
resolution: 'assigned' | 'discarded' | null;
/** If assigned, the box number it was assigned to */
assignedBoxNumber: string | null;
/** X position in PDF points */
x?: number;
/** Y position in PDF points */
y?: number;
/** PDF font identifier for debugging */
fontName?: string;
}
/**
 * Data needed to confirm a K-1 import session: the session being confirmed,
 * the (possibly user-overridden) tax year, the filing status for the
 * resulting KDocument, the verified fields, and how to treat an existing
 * KDocument.
 * NOTE(review): presumably the request body of the confirm endpoint — verify
 * against the controller/DTO, which are not visible here.
 */
export interface K1ConfirmationRequest {
/** Import session ID */
importSessionId: string;
/** Tax year (may have been overridden by user) */
taxYear: number;
/** Filing status for the new KDocument */
filingStatus: 'DRAFT' | 'ESTIMATED' | 'FINAL';
/** Verified fields with any user edits applied */
fields: K1ExtractedField[];
/** Whether to update an existing KDocument (null = create new) */
existingKDocumentAction: 'UPDATE' | 'CREATE_NEW' | null;
}
/**
 * Summary view of a K-1 import session.
 * NOTE(review): member semantics below are inferred from names and the
 * neighbouring interfaces; confirm against the service that produces this.
 */
export interface K1ImportSessionSummary {
/** Identifier of the import session. */
id: string;
/** Identifier of the partnership the session belongs to (presumably — confirm). */
partnershipId: string;
/** Session status; typed as a plain string, allowed values are not visible here. */
status: string;
/** Tax year associated with the session. */
taxYear: number;
/** Name of the uploaded file (presumably the original upload name — confirm). */
fileName: string;
/**
 * Extraction method used; typed as a plain string.
 * Presumably one of 'pdf-parse' | 'azure' | 'tesseract' — TODO confirm.
 */
extractionMethod: string;
/** Identifier of the related KDocument, or null (presumably until one exists — confirm). */
kDocumentId: string | null;
/** Creation time as a string (presumably an ISO-8601 timestamp — confirm). */
createdAt: string;
}
/**
 * Result of evaluating one aggregation rule.
 * NOTE(review): member semantics below are inferred from names; confirm
 * against the aggregation service, which is not visible here.
 */
export interface K1AggregationResult {
/** Identifier of the rule that produced this result. */
ruleId: string;
/** Name of the rule (presumably for display — confirm). */
name: string;
/** Operation applied by the rule; typed as a plain string, allowed values are not visible here. */
operation: string;
/** Cells the rule reads from (presumably box numbers — TODO confirm). */
sourceCells: string[];
/** Value computed by the rule. */
computedValue: number;
/** Numeric values keyed by string (presumably each source cell's contribution — TODO confirm). */
breakdown: Record<string, number>;
}

587
libs/ui/src/lib/k-document-form/k-document-form.component.ts

@ -1,5 +1,3 @@
import type { K1Data } from '@ghostfolio/common/interfaces';
import { CommonModule } from '@angular/common';
import {
ChangeDetectionStrategy,
@ -9,123 +7,179 @@ import {
OnChanges,
Output
} from '@angular/core';
import {
FormControl,
FormGroup,
ReactiveFormsModule,
Validators
} from '@angular/forms';
import { FormsModule } from '@angular/forms';
import { MatButtonModule } from '@angular/material/button';
import { MatCheckboxModule } from '@angular/material/checkbox';
import { MatFormFieldModule } from '@angular/material/form-field';
import { MatIconModule } from '@angular/material/icon';
import { MatInputModule } from '@angular/material/input';
import { MatSelectModule } from '@angular/material/select';
import { MatTooltipModule } from '@angular/material/tooltip';
// ── Field types ──────────────────────────────────────────────────────────
type FieldType = 'currency' | 'percent' | 'text' | 'checkbox';
const K1_FIELD_CONFIG: {
key: keyof K1Data;
interface K1FieldDef {
boxNumber: string;
label: string;
section: string;
}[] = [
{
key: 'ordinaryIncome',
label: 'Ordinary Income (Box 1)',
section: 'Income'
},
{
key: 'netRentalIncome',
label: 'Net Rental Income (Box 2)',
section: 'Income'
},
{
key: 'otherRentalIncome',
label: 'Other Rental Income (Box 3)',
section: 'Income'
},
{
key: 'guaranteedPayments',
label: 'Guaranteed Payments (Box 4)',
section: 'Income'
},
{
key: 'interestIncome',
label: 'Interest Income (Box 5)',
section: 'Income'
},
{ key: 'dividends', label: 'Dividends (Box 6a)', section: 'Income' },
{
key: 'qualifiedDividends',
label: 'Qualified Dividends (Box 6b)',
section: 'Income'
},
{ key: 'royalties', label: 'Royalties (Box 7)', section: 'Income' },
{
key: 'capitalGainLossShortTerm',
label: 'Short-Term Capital Gain/Loss (Box 8)',
section: 'Capital'
},
type: FieldType;
}
interface K1Section {
title: string;
description?: string;
fields: K1FieldDef[];
collapsed?: boolean;
}
// ── Section definitions matching the real IRS Schedule K-1 ───────────────
const K1_SECTIONS: K1Section[] = [
{
key: 'capitalGainLossLongTerm',
label: 'Long-Term Capital Gain/Loss (Box 9a)',
section: 'Capital'
title: 'Header / Metadata',
fields: [
{ boxNumber: 'K1_DOCUMENT_ID', label: 'K-1 Document ID', type: 'text' },
{ boxNumber: 'TAX_YEAR', label: 'Tax Year', type: 'text' },
{ boxNumber: 'FINAL_K1', label: 'Final K-1', type: 'checkbox' },
{ boxNumber: 'AMENDED_K1', label: 'Amended K-1', type: 'checkbox' }
],
collapsed: true
},
{
key: 'unrecaptured1250Gain',
label: 'Unrecaptured Section 1250 Gain (Box 9b)',
section: 'Capital'
title: 'Part I — Partnership Information',
fields: [
{ boxNumber: 'A', label: "A — Partnership's EIN", type: 'text' },
{ boxNumber: 'B', label: "B — Partnership's name / address", type: 'text' },
{ boxNumber: 'C', label: 'C — IRS center where return filed', type: 'text' },
{ boxNumber: 'D', label: 'D — Publicly traded partnership', type: 'checkbox' }
],
collapsed: true
},
{
key: 'section1231GainLoss',
label: 'Section 1231 Gain/Loss (Box 10)',
section: 'Capital'
title: 'Part II — Partner Information',
fields: [
{ boxNumber: 'E', label: "E — Partner's identifying number", type: 'text' },
{ boxNumber: 'F', label: "F — Partner's name / address", type: 'text' },
{ boxNumber: 'G_GENERAL', label: 'G — General partner / LLC member-manager', type: 'checkbox' },
{ boxNumber: 'G_LIMITED', label: 'G — Limited partner / other LLC member', type: 'checkbox' },
{ boxNumber: 'H1_DOMESTIC', label: 'H1 — Domestic partner', type: 'checkbox' },
{ boxNumber: 'H1_FOREIGN', label: 'H1 — Foreign partner', type: 'checkbox' },
{ boxNumber: 'H2', label: 'H2 — Disregarded entity', type: 'checkbox' },
{ boxNumber: 'H2_TIN', label: 'H2 — DE taxpayer ID', type: 'text' },
{ boxNumber: 'I1', label: 'I1 — Type of entity', type: 'text' },
{ boxNumber: 'I2', label: 'I2 — IRA / SEP / Keogh', type: 'checkbox' }
],
collapsed: true
},
{ key: 'otherIncome', label: 'Other Income (Box 11)', section: 'Capital' },
{
key: 'section179Deduction',
label: 'Section 179 Deduction (Box 12)',
section: 'Deductions'
title: "Section J — Partner's Share of Profit, Loss & Capital",
fields: [
{ boxNumber: 'J_PROFIT_BEGIN', label: 'Profit — Beginning', type: 'percent' },
{ boxNumber: 'J_PROFIT_END', label: 'Profit — Ending', type: 'percent' },
{ boxNumber: 'J_LOSS_BEGIN', label: 'Loss — Beginning', type: 'percent' },
{ boxNumber: 'J_LOSS_END', label: 'Loss — Ending', type: 'percent' },
{ boxNumber: 'J_CAPITAL_BEGIN', label: 'Capital — Beginning', type: 'percent' },
{ boxNumber: 'J_CAPITAL_END', label: 'Capital — Ending', type: 'percent' },
{ boxNumber: 'J_SALE', label: 'Decrease due to sale', type: 'checkbox' },
{ boxNumber: 'J_EXCHANGE', label: 'Exchange of partnership interest', type: 'checkbox' }
]
},
{
key: 'otherDeductions',
label: 'Other Deductions (Box 13)',
section: 'Deductions'
title: "Section K — Partner's Share of Liabilities",
fields: [
{ boxNumber: 'K_NONRECOURSE_BEGIN', label: 'Nonrecourse — Beginning', type: 'currency' },
{ boxNumber: 'K_NONRECOURSE_END', label: 'Nonrecourse — Ending', type: 'currency' },
{ boxNumber: 'K_QUAL_NONRECOURSE_BEGIN', label: 'Qualified nonrecourse — Beginning', type: 'currency' },
{ boxNumber: 'K_QUAL_NONRECOURSE_END', label: 'Qualified nonrecourse — Ending', type: 'currency' },
{ boxNumber: 'K_RECOURSE_BEGIN', label: 'Recourse — Beginning', type: 'currency' },
{ boxNumber: 'K_RECOURSE_END', label: 'Recourse — Ending', type: 'currency' },
{ boxNumber: 'K2', label: 'Includes lower-tier partnership liabilities', type: 'checkbox' },
{ boxNumber: 'K3', label: 'Liability subject to guarantees', type: 'checkbox' }
]
},
{
key: 'selfEmploymentEarnings',
label: 'Self-Employment Earnings (Box 14)',
section: 'Other'
title: "Section L — Partner's Capital Account",
fields: [
{ boxNumber: 'L_BEG_CAPITAL', label: 'Beginning capital account', type: 'currency' },
{ boxNumber: 'L_CONTRIBUTED', label: 'Capital contributed during year', type: 'currency' },
{ boxNumber: 'L_CURR_YR_INCOME', label: 'Current year net income (loss)', type: 'currency' },
{ boxNumber: 'L_OTHER', label: 'Other increase (decrease)', type: 'currency' },
{ boxNumber: 'L_WITHDRAWALS', label: 'Withdrawals & distributions', type: 'currency' },
{ boxNumber: 'L_END_CAPITAL', label: 'Ending capital account', type: 'currency' }
]
},
{
key: 'foreignTaxesPaid',
label: 'Foreign Taxes Paid (Box 16)',
section: 'Other'
title: 'Sections M & N',
fields: [
{ boxNumber: 'M_YES', label: 'M — Contributed property: Yes', type: 'checkbox' },
{ boxNumber: 'M_NO', label: 'M — Contributed property: No', type: 'checkbox' },
{ boxNumber: 'N_BEGINNING', label: 'N — Net 704(c) gain/loss: Beginning', type: 'currency' },
{ boxNumber: 'N_ENDING', label: 'N — Net 704(c) gain/loss: Ending', type: 'currency' }
]
},
{
key: 'alternativeMinimumTaxItems',
label: 'AMT Items (Box 17)',
section: 'Other'
title: 'Part III — Income & Gains (Boxes 1–11)',
fields: [
{ boxNumber: '1', label: '1 — Ordinary business income (loss)', type: 'currency' },
{ boxNumber: '2', label: '2 — Net rental real estate income (loss)', type: 'currency' },
{ boxNumber: '3', label: '3 — Other net rental income (loss)', type: 'currency' },
{ boxNumber: '4', label: '4 — Guaranteed payments for services', type: 'currency' },
{ boxNumber: '4a', label: '4a — Guaranteed payments for capital', type: 'currency' },
{ boxNumber: '4b', label: '4b — Total guaranteed payments', type: 'currency' },
{ boxNumber: '5', label: '5 — Interest income', type: 'currency' },
{ boxNumber: '6a', label: '6a — Ordinary dividends', type: 'currency' },
{ boxNumber: '6b', label: '6b — Qualified dividends', type: 'currency' },
{ boxNumber: '6c', label: '6c — Dividend equivalents', type: 'currency' },
{ boxNumber: '7', label: '7 — Royalties', type: 'currency' },
{ boxNumber: '8', label: '8 — Net short-term capital gain (loss)', type: 'currency' },
{ boxNumber: '9a', label: '9a — Net long-term capital gain (loss)', type: 'currency' },
{ boxNumber: '9b', label: '9b — Collectibles (28%) gain (loss)', type: 'currency' },
{ boxNumber: '9c', label: '9c — Unrecaptured §1250 gain', type: 'currency' },
{ boxNumber: '10', label: '10 — Net §1231 gain (loss)', type: 'currency' },
{ boxNumber: '11', label: '11 — Other income (loss)', type: 'currency' }
]
},
{
key: 'distributionsCash',
label: 'Cash Distributions (Box 19a)',
section: 'Distributions'
title: 'Part III — Deductions & Credits (Boxes 12–18)',
fields: [
{ boxNumber: '12', label: '12 — §179 deduction', type: 'currency' },
{ boxNumber: '13', label: '13 — Other deductions', type: 'currency' },
{ boxNumber: '14', label: '14 — Self-employment earnings (loss)', type: 'currency' },
{ boxNumber: '15', label: '15 — Credits', type: 'currency' },
{ boxNumber: '16', label: '16 — Foreign transactions', type: 'currency' },
{ boxNumber: '16_K3', label: '16 — Schedule K-3 attached', type: 'checkbox' },
{ boxNumber: '17', label: '17 — AMT items', type: 'currency' },
{ boxNumber: '18', label: '18 — Tax-exempt income / nondeductible expenses', type: 'currency' }
]
},
{
key: 'distributionsProperty',
label: 'Property Distributions (Box 19b)',
section: 'Distributions'
title: 'Part III — Distributions & Other (Boxes 19–23)',
fields: [
{ boxNumber: '19', label: '19 — Distributions', type: 'currency' },
{ boxNumber: '19a', label: '19a — Cash & marketable securities', type: 'currency' },
{ boxNumber: '19b', label: '19b — Other property', type: 'currency' },
{ boxNumber: '20A', label: '20A — Other information: Code A', type: 'currency' },
{ boxNumber: '20B', label: '20B — Other information: Code B', type: 'currency' },
{ boxNumber: '20V', label: '20V — Other information: Code V', type: 'currency' },
{ boxNumber: '20_WILDCARD', label: '20 — Other information: Other codes', type: 'currency' },
{ boxNumber: '21', label: '21 — Foreign taxes paid or accrued', type: 'currency' },
{ boxNumber: '22', label: '22 — At-risk: more than one activity', type: 'checkbox' },
{ boxNumber: '23', label: '23 — Passive: more than one activity', type: 'checkbox' }
]
}
];
const SECTIONS = ['Income', 'Capital', 'Deductions', 'Other', 'Distributions'];
@Component({
changeDetection: ChangeDetectionStrategy.OnPush,
imports: [
CommonModule,
FormsModule,
MatButtonModule,
MatCheckboxModule,
MatFormFieldModule,
MatIconModule,
MatInputModule,
MatSelectModule,
ReactiveFormsModule
MatTooltipModule
],
selector: 'gf-k-document-form',
standalone: true,
@ -135,135 +189,342 @@ const SECTIONS = ['Income', 'Capital', 'Deductions', 'Other', 'Distributions'];
display: block;
}
.section-title {
font-size: 14px;
.form-header {
display: flex;
align-items: center;
justify-content: space-between;
flex-wrap: wrap;
gap: 12px;
margin-bottom: 20px;
}
/* Collapsible sections */
.k1-section {
margin-bottom: 12px;
border: 1px solid rgba(0, 0, 0, 0.08);
border-radius: 8px;
overflow: hidden;
}
.section-header {
display: flex;
align-items: center;
gap: 8px;
padding: 10px 16px;
background: rgba(0, 0, 0, 0.03);
cursor: pointer;
user-select: none;
font-weight: 500;
color: rgba(var(--dark-primary-text), 0.7);
margin: 16px 0 8px;
padding-bottom: 4px;
border-bottom: 1px solid rgba(var(--dark-dividers), 0.12);
font-size: 14px;
transition: background 0.15s;
}
.section-header:hover {
background: rgba(0, 0, 0, 0.06);
}
.section-header mat-icon {
font-size: 18px;
width: 18px;
height: 18px;
transition: transform 0.2s;
}
.section-header mat-icon.expanded {
transform: rotate(90deg);
}
.section-header .section-desc {
font-weight: 400;
font-size: 12px;
color: rgba(0, 0, 0, 0.5);
margin-left: auto;
}
.section-body {
padding: 12px 16px 4px;
}
/* Two-column grid */
.fields-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
gap: 0 16px;
grid-template-columns: 1fr 1fr;
gap: 2px 24px;
}
@media (max-width: 700px) {
.fields-grid {
grid-template-columns: 1fr;
}
}
/* Field rows */
.field-row {
display: flex;
align-items: center;
gap: 8px;
padding: 4px 0;
min-height: 34px;
}
.field-label {
flex: 1 1 auto;
font-size: 13px;
color: rgba(0, 0, 0, 0.72);
line-height: 1.3;
min-width: 0;
}
.status-row {
.field-input {
flex: 0 0 140px;
display: flex;
gap: 16px;
margin-bottom: 16px;
align-items: center;
}
.field-input input {
width: 100%;
box-sizing: border-box;
padding: 5px 8px;
font-size: 13px;
font-family: 'Roboto Mono', monospace;
border: 1px solid rgba(0, 0, 0, 0.18);
border-radius: 4px;
background: transparent;
outline: none;
text-align: right;
transition: border-color 0.15s;
}
.field-input input:focus {
border-color: #1976d2;
box-shadow: 0 0 0 1px #1976d2;
}
.field-input input.text-input {
text-align: left;
font-family: inherit;
}
.field-input .unit-suffix {
font-size: 12px;
color: rgba(0, 0, 0, 0.45);
margin-left: 3px;
flex-shrink: 0;
}
.actions {
.field-input .unit-prefix {
font-size: 12px;
color: rgba(0, 0, 0, 0.45);
margin-right: 3px;
flex-shrink: 0;
}
.field-input input.is-zero {
color: rgba(0, 0, 0, 0.3);
}
/* Checkbox row */
.field-row-checkbox {
cursor: pointer;
}
.field-row-checkbox .cb-label {
font-size: 13px;
color: rgba(0, 0, 0, 0.72);
}
/* Footer */
.form-footer {
display: flex;
justify-content: flex-end;
gap: 8px;
margin-top: 16px;
margin-top: 20px;
padding-top: 12px;
border-top: 1px solid rgba(0, 0, 0, 0.08);
}
`
],
template: `
<form [formGroup]="form" (ngSubmit)="onSubmit()">
<div class="status-row">
<mat-form-field>
<mat-label>Filing Status</mat-label>
<mat-select formControlName="filingStatus">
<mat-option value="DRAFT">Draft</mat-option>
<mat-option value="ESTIMATED">Estimated</mat-option>
<mat-option value="FINAL">Final</mat-option>
</mat-select>
</mat-form-field>
</div>
<div class="form-header">
<mat-form-field style="min-width: 180px">
<mat-label>Filing Status</mat-label>
<mat-select [(ngModel)]="filingStatusValue">
<mat-option value="DRAFT">Draft</mat-option>
<mat-option value="ESTIMATED">Estimated</mat-option>
<mat-option value="FINAL">Final</mat-option>
</mat-select>
</mat-form-field>
</div>
@for (section of sections; track section) {
<div class="section-title">{{ section }}</div>
<div class="fields-grid">
@for (field of getFieldsForSection(section); track field.key) {
<mat-form-field>
<mat-label>{{ field.label }}</mat-label>
<input matInput type="number" [formControlName]="field.key" />
</mat-form-field>
@for (section of sections; track section.title) {
<div class="k1-section">
<div class="section-header" (click)="section.collapsed = !section.collapsed">
<mat-icon [class.expanded]="!section.collapsed">chevron_right</mat-icon>
<span>{{ section.title }}</span>
@if (section.description) {
<span class="section-desc">{{ section.description }}</span>
}
</div>
}
<div class="actions">
<button mat-button type="button" (click)="cancelled.emit()">
Cancel
</button>
<button
color="primary"
mat-flat-button
type="submit"
[disabled]="!form.valid"
>
{{ isEditMode ? 'Update' : 'Create' }}
</button>
@if (!section.collapsed) {
<div class="section-body">
<div class="fields-grid">
@for (field of section.fields; track field.boxNumber) {
@if (field.type === 'checkbox') {
<div class="field-row field-row-checkbox">
<mat-checkbox
[checked]="isChecked(field.boxNumber)"
(change)="setCheckbox(field.boxNumber, $event.checked)">
<span class="cb-label">{{ field.label }}</span>
</mat-checkbox>
</div>
} @else if (field.type === 'text') {
<div class="field-row">
<span class="field-label">{{ field.label }}</span>
<div class="field-input">
<input class="text-input"
[value]="getTextValue(field.boxNumber)"
(input)="setTextValue(field.boxNumber, $event)"
placeholder="—" />
</div>
</div>
} @else if (field.type === 'percent') {
<div class="field-row">
<span class="field-label">{{ field.label }}</span>
<div class="field-input">
<input type="number" step="any"
[value]="getNumericDisplay(field.boxNumber)"
[class.is-zero]="isZero(field.boxNumber)"
(input)="setNumericValue(field.boxNumber, $event)"
placeholder="0" />
<span class="unit-suffix">%</span>
</div>
</div>
} @else {
<div class="field-row">
<span class="field-label">{{ field.label }}</span>
<div class="field-input">
<span class="unit-prefix">$</span>
<input type="number" step="any"
[value]="getNumericDisplay(field.boxNumber)"
[class.is-zero]="isZero(field.boxNumber)"
(input)="setNumericValue(field.boxNumber, $event)"
placeholder="0" />
</div>
</div>
}
}
</div>
</div>
}
</div>
</form>
}
<div class="form-footer">
<button mat-button type="button" (click)="cancelled.emit()">Cancel</button>
<button mat-flat-button color="primary" (click)="onSubmit()">
{{ isEditMode ? 'Update' : 'Create' }}
</button>
</div>
`
})
export class GfKDocumentFormComponent implements OnChanges {
@Input() public data: K1Data | null = null;
@Input() public data: Record<string, number | string | null> | null = null;
@Input() public filingStatus: string = 'DRAFT';
@Input() public isEditMode: boolean = false;
@Output() public cancelled = new EventEmitter<void>();
@Output() public submitted = new EventEmitter<{
filingStatus: string;
data: Record<string, number>;
data: Record<string, number | string | null>;
}>();
public form: FormGroup;
public sections = SECTIONS;
public filingStatusValue = 'DRAFT';
public sections: K1Section[] = [];
/** Internal data store keyed by boxNumber */
private values: Record<string, number | string | null> = {};
public constructor() {
const controls: Record<string, FormControl> = {
filingStatus: new FormControl('DRAFT', Validators.required)
};
this.sections = K1_SECTIONS.map((s) => ({
...s,
fields: [...s.fields],
collapsed: s.collapsed ?? false
}));
}
public ngOnChanges(): void {
this.filingStatusValue = this.filingStatus || 'DRAFT';
for (const field of K1_FIELD_CONFIG) {
controls[field.key] = new FormControl(0);
if (this.data) {
this.values = { ...this.data };
} else {
this.values = {};
}
}
// ── Value accessors ────────────────────────────────────────────────────
this.form = new FormGroup(controls);
/**
 * Reports whether the stored value for a checkbox field counts as checked.
 * Only the string 'true', the number 1 and the string '1' are treated as
 * checked; every other stored value (including a missing one) is unchecked.
 */
public isChecked(boxNumber: string): boolean {
  const stored = this.values[boxNumber];
  if (stored === 1) {
    return true;
  }
  return stored === 'true' || stored === '1';
}
public ngOnChanges(): void {
if (this.data) {
const patchData: Record<string, unknown> = {
filingStatus: this.filingStatus
};
/**
 * Stores a checkbox state for the given box as the string 'true' or 'false'.
 */
public setCheckbox(boxNumber: string, checked: boolean): void {
  if (checked) {
    this.values[boxNumber] = 'true';
  } else {
    this.values[boxNumber] = 'false';
  }
}
for (const field of K1_FIELD_CONFIG) {
patchData[field.key] = this.data[field.key] ?? 0;
}
/**
 * Returns the stored value for a text field as a string, or an empty string
 * when nothing (null/undefined) is stored for that box.
 */
public getTextValue(boxNumber: string): string {
  const stored = this.values[boxNumber];
  if (stored == null) {
    return '';
  }
  return String(stored);
}
this.form.patchValue(patchData);
/**
 * Input handler for text fields: stores the input element's current text for
 * the given box, or null when the text is empty.
 */
public setTextValue(boxNumber: string, event: Event): void {
  const { value: text } = event.target as HTMLInputElement;
  this.values[boxNumber] = text ? text : null;
}
/**
 * Returns the text to show in a numeric input for the given box.
 * Missing or empty values, and values that do not convert to a number,
 * yield an empty string; otherwise the number is rendered via String().
 */
public getNumericDisplay(boxNumber: string): string {
  const stored = this.values[boxNumber];
  const isBlank = stored == null || stored === '';
  if (isBlank) {
    return '';
  }
  const parsed = Number(stored);
  if (Number.isNaN(parsed)) {
    return '';
  }
  return String(parsed);
}
public getFieldsForSection(
section: string
): { key: keyof K1Data; label: string; section: string }[] {
return K1_FIELD_CONFIG.filter((f) => f.section === section);
/**
 * Reports whether the stored value for the given box is exactly zero,
 * either as the number 0 or as the string '0'.
 */
public isZero(boxNumber: string): boolean {
  const stored = this.values[boxNumber];
  if (stored === '0') {
    return true;
  }
  return stored === 0;
}
/**
 * Input handler for numeric fields: parses the input element's text with
 * parseFloat and stores the result for the given box. Empty text or text
 * that does not parse to a number is stored as null.
 */
public setNumericValue(boxNumber: string, event: Event): void {
  const { value: text } = event.target as HTMLInputElement;
  let next: number | null = null;
  if (text !== '' && text != null) {
    const parsed = parseFloat(text);
    if (!Number.isNaN(parsed)) {
      next = parsed;
    }
  }
  this.values[boxNumber] = next;
}
// ── Submit ─────────────────────────────────────────────────────────────
public onSubmit(): void {
if (this.form.valid) {
const value = this.form.value;
const data: Record<string, number> = {};
const data: Record<string, number | string | null> = {};
for (const field of K1_FIELD_CONFIG) {
data[field.key] = Number(value[field.key]) || 0;
for (const section of this.sections) {
for (const field of section.fields) {
const v = this.values[field.boxNumber];
if (v != null && v !== '') {
data[field.boxNumber] = v;
}
}
this.submitted.emit({
data,
filingStatus: value.filingStatus
});
}
this.submitted.emit({
data,
filingStatus: this.filingStatusValue
});
}
}

460
package-lock.json

@ -21,6 +21,7 @@
"@angular/platform-browser-dynamic": "21.1.1",
"@angular/router": "21.1.1",
"@angular/service-worker": "21.1.1",
"@azure/ai-form-recognizer": "^5.1.0",
"@bull-board/api": "6.20.3",
"@bull-board/express": "6.20.3",
"@bull-board/nestjs": "6.20.3",
@ -87,11 +88,13 @@
"passport-headerapikey": "1.2.2",
"passport-jwt": "4.0.1",
"passport-openidconnect": "0.1.2",
"pdf-parse": "^2.4.5",
"reflect-metadata": "0.2.2",
"rxjs": "7.8.1",
"stripe": "20.3.0",
"svgmap": "2.19.2",
"tablemark": "4.1.0",
"tesseract.js": "^7.0.0",
"twitter-api-v2": "1.29.0",
"yahoo-finance2": "3.13.2",
"zone.js": "0.16.0"
@ -137,6 +140,7 @@
"@types/papaparse": "5.3.7",
"@types/passport-google-oauth20": "2.0.17",
"@types/passport-openidconnect": "0.1.3",
"@types/pdf-parse": "^1.1.5",
"@typescript-eslint/eslint-plugin": "8.43.0",
"@typescript-eslint/parser": "8.43.0",
"eslint": "9.35.0",
@ -1698,6 +1702,154 @@
"license": "MIT",
"peer": true
},
"node_modules/@azure/abort-controller": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/@azure/abort-controller/-/abort-controller-2.1.2.tgz",
"integrity": "sha512-nBrLsEWm4J2u5LpAPjxADTlq3trDgVZZXHNKabeXZtpq3d3AbN/KGO82R87rdDz5/lYB024rtEf10/q0urNgsA==",
"license": "MIT",
"dependencies": {
"tslib": "^2.6.2"
},
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/@azure/ai-form-recognizer": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/@azure/ai-form-recognizer/-/ai-form-recognizer-5.1.0.tgz",
"integrity": "sha512-XH6Nyj8+F/O3fH9RhHRUSSFkYMTJrDbw8F8M2mXm8jDkE06KQL0EDD9MTN9uLf+pZiYUWsEOQD9bPnLEtoh+lQ==",
"license": "MIT",
"dependencies": {
"@azure/abort-controller": "^2.1.2",
"@azure/core-auth": "^1.9.0",
"@azure/core-client": "^1.9.2",
"@azure/core-lro": "^2.2.0",
"@azure/core-paging": "^1.6.2",
"@azure/core-rest-pipeline": "^1.19.0",
"@azure/core-tracing": "^1.2.0",
"@azure/logger": "^1.1.4",
"tslib": "^2.8.1"
},
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/@azure/core-auth": {
"version": "1.10.1",
"resolved": "https://registry.npmjs.org/@azure/core-auth/-/core-auth-1.10.1.tgz",
"integrity": "sha512-ykRMW8PjVAn+RS6ww5cmK9U2CyH9p4Q88YJwvUslfuMmN98w/2rdGRLPqJYObapBCdzBVeDgYWdJnFPFb7qzpg==",
"license": "MIT",
"dependencies": {
"@azure/abort-controller": "^2.1.2",
"@azure/core-util": "^1.13.0",
"tslib": "^2.6.2"
},
"engines": {
"node": ">=20.0.0"
}
},
"node_modules/@azure/core-client": {
"version": "1.10.1",
"resolved": "https://registry.npmjs.org/@azure/core-client/-/core-client-1.10.1.tgz",
"integrity": "sha512-Nh5PhEOeY6PrnxNPsEHRr9eimxLwgLlpmguQaHKBinFYA/RU9+kOYVOQqOrTsCL+KSxrLLl1gD8Dk5BFW/7l/w==",
"license": "MIT",
"dependencies": {
"@azure/abort-controller": "^2.1.2",
"@azure/core-auth": "^1.10.0",
"@azure/core-rest-pipeline": "^1.22.0",
"@azure/core-tracing": "^1.3.0",
"@azure/core-util": "^1.13.0",
"@azure/logger": "^1.3.0",
"tslib": "^2.6.2"
},
"engines": {
"node": ">=20.0.0"
}
},
"node_modules/@azure/core-lro": {
"version": "2.7.2",
"resolved": "https://registry.npmjs.org/@azure/core-lro/-/core-lro-2.7.2.tgz",
"integrity": "sha512-0YIpccoX8m/k00O7mDDMdJpbr6mf1yWo2dfmxt5A8XVZVVMz2SSKaEbMCeJRvgQ0IaSlqhjT47p4hVIRRy90xw==",
"license": "MIT",
"dependencies": {
"@azure/abort-controller": "^2.0.0",
"@azure/core-util": "^1.2.0",
"@azure/logger": "^1.0.0",
"tslib": "^2.6.2"
},
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/@azure/core-paging": {
"version": "1.6.2",
"resolved": "https://registry.npmjs.org/@azure/core-paging/-/core-paging-1.6.2.tgz",
"integrity": "sha512-YKWi9YuCU04B55h25cnOYZHxXYtEvQEbKST5vqRga7hWY9ydd3FZHdeQF8pyh+acWZvppw13M/LMGx0LABUVMA==",
"license": "MIT",
"dependencies": {
"tslib": "^2.6.2"
},
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/@azure/core-rest-pipeline": {
"version": "1.23.0",
"resolved": "https://registry.npmjs.org/@azure/core-rest-pipeline/-/core-rest-pipeline-1.23.0.tgz",
"integrity": "sha512-Evs1INHo+jUjwHi1T6SG6Ua/LHOQBCLuKEEE6efIpt4ZOoNonaT1kP32GoOcdNDbfqsD2445CPri3MubBy5DEQ==",
"license": "MIT",
"dependencies": {
"@azure/abort-controller": "^2.1.2",
"@azure/core-auth": "^1.10.0",
"@azure/core-tracing": "^1.3.0",
"@azure/core-util": "^1.13.0",
"@azure/logger": "^1.3.0",
"@typespec/ts-http-runtime": "^0.3.4",
"tslib": "^2.6.2"
},
"engines": {
"node": ">=20.0.0"
}
},
"node_modules/@azure/core-tracing": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/@azure/core-tracing/-/core-tracing-1.3.1.tgz",
"integrity": "sha512-9MWKevR7Hz8kNzzPLfX4EAtGM2b8mr50HPDBvio96bURP/9C+HjdH3sBlLSNNrvRAr5/k/svoH457gB5IKpmwQ==",
"license": "MIT",
"dependencies": {
"tslib": "^2.6.2"
},
"engines": {
"node": ">=20.0.0"
}
},
"node_modules/@azure/core-util": {
"version": "1.13.1",
"resolved": "https://registry.npmjs.org/@azure/core-util/-/core-util-1.13.1.tgz",
"integrity": "sha512-XPArKLzsvl0Hf0CaGyKHUyVgF7oDnhKoP85Xv6M4StF/1AhfORhZudHtOyf2s+FcbuQ9dPRAjB8J2KvRRMUK2A==",
"license": "MIT",
"dependencies": {
"@azure/abort-controller": "^2.1.2",
"@typespec/ts-http-runtime": "^0.3.0",
"tslib": "^2.6.2"
},
"engines": {
"node": ">=20.0.0"
}
},
"node_modules/@azure/logger": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/@azure/logger/-/logger-1.3.0.tgz",
"integrity": "sha512-fCqPIfOcLE+CGqGPd66c8bZpwAji98tZ4JI9i/mlTNTlsIWslCfpg48s/ypyLxZTump5sypjrKn2/kY7q8oAbA==",
"license": "MIT",
"dependencies": {
"@typespec/ts-http-runtime": "^0.3.0",
"tslib": "^2.6.2"
},
"engines": {
"node": ">=20.0.0"
}
},
"node_modules/@babel/code-frame": {
"version": "7.28.6",
"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.28.6.tgz",
@ -7249,6 +7401,190 @@
"win32"
]
},
"node_modules/@napi-rs/canvas": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.80.tgz",
"integrity": "sha512-DxuT1ClnIPts1kQx8FBmkk4BQDTfI5kIzywAaMjQSXfNnra5UFU9PwurXrl+Je3bJ6BGsp/zmshVVFbCmyI+ww==",
"license": "MIT",
"workspaces": [
"e2e/*"
],
"engines": {
"node": ">= 10"
},
"optionalDependencies": {
"@napi-rs/canvas-android-arm64": "0.1.80",
"@napi-rs/canvas-darwin-arm64": "0.1.80",
"@napi-rs/canvas-darwin-x64": "0.1.80",
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.80",
"@napi-rs/canvas-linux-arm64-gnu": "0.1.80",
"@napi-rs/canvas-linux-arm64-musl": "0.1.80",
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.80",
"@napi-rs/canvas-linux-x64-gnu": "0.1.80",
"@napi-rs/canvas-linux-x64-musl": "0.1.80",
"@napi-rs/canvas-win32-x64-msvc": "0.1.80"
}
},
"node_modules/@napi-rs/canvas-android-arm64": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.80.tgz",
"integrity": "sha512-sk7xhN/MoXeuExlggf91pNziBxLPVUqF2CAVnB57KLG/pz7+U5TKG8eXdc3pm0d7Od0WreB6ZKLj37sX9muGOQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-darwin-arm64": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.80.tgz",
"integrity": "sha512-O64APRTXRUiAz0P8gErkfEr3lipLJgM6pjATwavZ22ebhjYl/SUbpgM0xcWPQBNMP1n29afAC/Us5PX1vg+JNQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-darwin-x64": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.80.tgz",
"integrity": "sha512-FqqSU7qFce0Cp3pwnTjVkKjjOtxMqRe6lmINxpIZYaZNnVI0H5FtsaraZJ36SiTHNjZlUB69/HhxNDT1Aaa9vA==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.80.tgz",
"integrity": "sha512-eyWz0ddBDQc7/JbAtY4OtZ5SpK8tR4JsCYEZjCE3dI8pqoWUC8oMwYSBGCYfsx2w47cQgQCgMVRVTFiiO38hHQ==",
"cpu": [
"arm"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm64-gnu": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.80.tgz",
"integrity": "sha512-qwA63t8A86bnxhuA/GwOkK3jvb+XTQaTiVML0vAWoHyoZYTjNs7BzoOONDgTnNtr8/yHrq64XXzUoLqDzU+Uuw==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm64-musl": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.80.tgz",
"integrity": "sha512-1XbCOz/ymhj24lFaIXtWnwv/6eFHXDrjP0jYkc6iHQ9q8oXKzUX1Lc6bu+wuGiLhGh2GS/2JlfORC5ZcXimRcg==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.80.tgz",
"integrity": "sha512-XTzR125w5ZMs0lJcxRlS1K3P5RaZ9RmUsPtd1uGt+EfDyYMu4c6SEROYsxyatbbu/2+lPe7MPHOO/0a0x7L/gw==",
"cpu": [
"riscv64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-x64-gnu": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.80.tgz",
"integrity": "sha512-BeXAmhKg1kX3UCrJsYbdQd3hIMDH/K6HnP/pG2LuITaXhXBiNdh//TVVVVCBbJzVQaV5gK/4ZOCMrQW9mvuTqA==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-x64-musl": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.80.tgz",
"integrity": "sha512-x0XvZWdHbkgdgucJsRxprX/4o4sEed7qo9rCQA9ugiS9qE2QvP0RIiEugtZhfLH3cyI+jIRFJHV4Fuz+1BHHMg==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-win32-x64-msvc": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.80.tgz",
"integrity": "sha512-Z8jPsM6df5V8B1HrCHB05+bDiCxjE9QA//3YrkKIdVDEwn5RKaqOxCJDRJkl48cJbylcrJbW4HxZbTte8juuPg==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/nice": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/@napi-rs/nice/-/nice-1.1.1.tgz",
@ -13416,6 +13752,16 @@
"@types/passport": "*"
}
},
"node_modules/@types/pdf-parse": {
"version": "1.1.5",
"resolved": "https://registry.npmjs.org/@types/pdf-parse/-/pdf-parse-1.1.5.tgz",
"integrity": "sha512-kBfrSXsloMnUJOKi25s3+hRmkycHfLK6A09eRGqF/N8BkQoPUmaCr+q8Cli5FnfohEz/rsv82zAiPz/LXtOGhA==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/qs": {
"version": "6.14.0",
"resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.14.0.tgz",
@ -13823,6 +14169,20 @@
"url": "https://opencollective.com/eslint"
}
},
"node_modules/@typespec/ts-http-runtime": {
"version": "0.3.4",
"resolved": "https://registry.npmjs.org/@typespec/ts-http-runtime/-/ts-http-runtime-0.3.4.tgz",
"integrity": "sha512-CI0NhTrz4EBaa0U+HaaUZrJhPoso8sG7ZFya8uQoBA57fjzrjRSv87ekCjLZOFExN+gXE/z0xuN2QfH4H2HrLQ==",
"license": "MIT",
"dependencies": {
"http-proxy-agent": "^7.0.0",
"https-proxy-agent": "^7.0.0",
"tslib": "^2.6.2"
},
"engines": {
"node": ">=20.0.0"
}
},
"node_modules/@ungap/structured-clone": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.3.0.tgz",
@ -15609,6 +15969,12 @@
"readable-stream": "^3.4.0"
}
},
"node_modules/bmp-js": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/bmp-js/-/bmp-js-0.1.0.tgz",
"integrity": "sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==",
"license": "MIT"
},
"node_modules/body-parser": {
"version": "2.2.1",
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.1.tgz",
@ -22017,7 +22383,6 @@
"version": "7.0.2",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
"integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
"dev": true,
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.0",
@ -22182,6 +22547,12 @@
"postcss": "^8.1.0"
}
},
"node_modules/idb-keyval": {
"version": "6.2.2",
"resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-6.2.2.tgz",
"integrity": "sha512-yjD9nARJ/jb1g+CvD0tlhUHOrJ9Sy0P8T9MF3YaLlHnSRpwPfpTX0XIvpmw3gAJUmEu3FiICLBDPXVwyEvrleg==",
"license": "Apache-2.0"
},
"node_modules/identity-obj-proxy": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/identity-obj-proxy/-/identity-obj-proxy-3.0.0.tgz",
@ -22971,7 +23342,6 @@
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",
"integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==",
"dev": true,
"license": "MIT"
},
"node_modules/is-weakmap": {
@ -27361,6 +27731,15 @@
"integrity": "sha512-vCseG/EQ6/RcvxhUcGJiHViOgrtz4x0XbZepXvKik66TMGkvbmjeJrKFyBEx6daG5rNyyd14zYXhz0hZVwQFOw==",
"license": "MIT"
},
"node_modules/opencollective-postinstall": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/opencollective-postinstall/-/opencollective-postinstall-2.0.3.tgz",
"integrity": "sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==",
"license": "MIT",
"bin": {
"opencollective-postinstall": "index.js"
}
},
"node_modules/opener": {
"version": "1.5.2",
"resolved": "https://registry.npmjs.org/opener/-/opener-1.5.2.tgz",
@ -28179,6 +28558,38 @@
"resolved": "https://registry.npmjs.org/pause/-/pause-0.0.1.tgz",
"integrity": "sha512-KG8UEiEVkR3wGEb4m5yZkVCzigAD+cVEJck2CzYZO37ZGJfctvVptVO192MwrtPhzONn6go8ylnOdMhKqi4nfg=="
},
"node_modules/pdf-parse": {
"version": "2.4.5",
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-2.4.5.tgz",
"integrity": "sha512-mHU89HGh7v+4u2ubfnevJ03lmPgQ5WU4CxAVmTSh/sxVTEDYd1er/dKS/A6vg77NX47KTEoihq8jZBLr8Cxuwg==",
"license": "Apache-2.0",
"dependencies": {
"@napi-rs/canvas": "0.1.80",
"pdfjs-dist": "5.4.296"
},
"bin": {
"pdf-parse": "bin/cli.mjs"
},
"engines": {
"node": ">=20.16.0 <21 || >=22.3.0"
},
"funding": {
"type": "github",
"url": "https://github.com/sponsors/mehmet-kozan"
}
},
"node_modules/pdfjs-dist": {
"version": "5.4.296",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz",
"integrity": "sha512-DlOzet0HO7OEnmUmB6wWGJrrdvbyJKftI1bhMitK7O2N8W2gc757yyYBbINy9IDafXAV9wmKr9t7xsTaNKRG5Q==",
"license": "Apache-2.0",
"engines": {
"node": ">=20.16.0 || >=22.3.0"
},
"optionalDependencies": {
"@napi-rs/canvas": "^0.1.80"
}
},
"node_modules/perfect-debounce": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/perfect-debounce/-/perfect-debounce-1.0.0.tgz",
@ -29674,6 +30085,12 @@
"node": ">=4"
}
},
"node_modules/regenerator-runtime": {
"version": "0.13.11",
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz",
"integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==",
"license": "MIT"
},
"node_modules/regex-parser": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/regex-parser/-/regex-parser-2.3.1.tgz",
@ -32749,6 +33166,30 @@
"devOptional": true,
"license": "MIT"
},
"node_modules/tesseract.js": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/tesseract.js/-/tesseract.js-7.0.0.tgz",
"integrity": "sha512-exPBkd+z+wM1BuMkx/Bjv43OeLBxhL5kKWsz/9JY+DXcXdiBjiAch0V49QR3oAJqCaL5qURE0vx9Eo+G5YE7mA==",
"hasInstallScript": true,
"license": "Apache-2.0",
"dependencies": {
"bmp-js": "^0.1.0",
"idb-keyval": "^6.2.0",
"is-url": "^1.2.4",
"node-fetch": "^2.6.9",
"opencollective-postinstall": "^2.0.3",
"regenerator-runtime": "^0.13.3",
"tesseract.js-core": "^7.0.0",
"wasm-feature-detect": "^1.8.0",
"zlibjs": "^0.3.1"
}
},
"node_modules/tesseract.js-core": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-7.0.0.tgz",
"integrity": "sha512-WnNH518NzmbSq9zgTPeoF8c+xmilS8rFIl1YKbk/ptuuc7p6cLNELNuPAzcmsYw450ca6bLa8j3t0VAtq435Vw==",
"license": "Apache-2.0"
},
"node_modules/test-exclude": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/test-exclude/-/test-exclude-6.0.0.tgz",
@ -34279,6 +34720,12 @@
"makeerror": "1.0.12"
}
},
"node_modules/wasm-feature-detect": {
"version": "1.8.0",
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz",
"integrity": "sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==",
"license": "Apache-2.0"
},
"node_modules/watchpack": {
"version": "2.5.0",
"resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.5.0.tgz",
@ -35723,6 +36170,15 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/zlibjs": {
"version": "0.3.1",
"resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz",
"integrity": "sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==",
"license": "MIT",
"engines": {
"node": "*"
}
},
"node_modules/zod": {
"version": "3.25.76",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",

4
package.json

@ -66,6 +66,7 @@
"@angular/platform-browser-dynamic": "21.1.1",
"@angular/router": "21.1.1",
"@angular/service-worker": "21.1.1",
"@azure/ai-form-recognizer": "^5.1.0",
"@bull-board/api": "6.20.3",
"@bull-board/express": "6.20.3",
"@bull-board/nestjs": "6.20.3",
@ -132,11 +133,13 @@
"passport-headerapikey": "1.2.2",
"passport-jwt": "4.0.1",
"passport-openidconnect": "0.1.2",
"pdf-parse": "^2.4.5",
"reflect-metadata": "0.2.2",
"rxjs": "7.8.1",
"stripe": "20.3.0",
"svgmap": "2.19.2",
"tablemark": "4.1.0",
"tesseract.js": "^7.0.0",
"twitter-api-v2": "1.29.0",
"yahoo-finance2": "3.13.2",
"zone.js": "0.16.0"
@ -182,6 +185,7 @@
"@types/papaparse": "5.3.7",
"@types/passport-google-oauth20": "2.0.17",
"@types/passport-openidconnect": "0.1.3",
"@types/pdf-parse": "^1.1.5",
"@typescript-eslint/eslint-plugin": "8.43.0",
"@typescript-eslint/parser": "8.43.0",
"eslint": "9.35.0",

93
prisma/migrations/20260321000000_added_k1_import_tables/migration.sql

@ -0,0 +1,93 @@
-- CreateEnum
-- Status values for a K-1 import session. 'PROCESSING' is the value used as
-- the column default on "K1ImportSession"."status" in this migration.
CREATE TYPE "K1ImportStatus" AS ENUM ('PROCESSING', 'EXTRACTED', 'VERIFIED', 'CONFIRMED', 'CANCELLED', 'FAILED');
-- CreateTable
-- Tracks a single K-1 import: the owning partnership and user, the tax year,
-- the uploaded file's name and size, the extraction method, and the status.
CREATE TABLE "K1ImportSession" (
"id" TEXT NOT NULL,
"partnershipId" TEXT NOT NULL,
"userId" TEXT NOT NULL,
"status" "K1ImportStatus" NOT NULL DEFAULT 'PROCESSING',
"taxYear" INTEGER NOT NULL,
"fileName" TEXT NOT NULL,
"fileSize" INTEGER NOT NULL,
-- Free-form text, not an enum; NOTE(review): presumably names the extractor
-- that produced "rawExtraction" -- confirm against the import service.
"extractionMethod" TEXT NOT NULL,
-- Both JSON payloads are nullable, so a session can exist before either is set.
"rawExtraction" JSONB,
"verifiedData" JSONB,
-- Nullable links; the foreign keys added later in this migration reset them
-- to NULL when the referenced Document / KDocument row is deleted.
"documentId" TEXT,
"kDocumentId" TEXT,
"errorMessage" TEXT,
"createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
-- No database default: every insert must supply a value.
"updatedAt" TIMESTAMP(3) NOT NULL,
CONSTRAINT "K1ImportSession_pkey" PRIMARY KEY ("id")
);
-- CreateTable
-- Label and display metadata for a K-1 box, keyed by "boxNumber" and
-- optionally scoped to a partnership.
CREATE TABLE "CellMapping" (
"id" TEXT NOT NULL,
-- Nullable. NOTE(review): rows with NULL here are presumably global defaults
-- shared by all partnerships -- confirm against the cell-mapping service.
"partnershipId" TEXT,
"boxNumber" TEXT NOT NULL,
"label" TEXT NOT NULL,
"description" TEXT,
-- Free-form text defaulting to 'number'; the set of valid values is not
-- enforced at the database level.
"cellType" TEXT NOT NULL DEFAULT 'number',
"isCustom" BOOLEAN NOT NULL DEFAULT false,
"isIgnored" BOOLEAN NOT NULL DEFAULT false,
"sortOrder" INTEGER NOT NULL,
"createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
-- No database default: every insert must supply a value.
"updatedAt" TIMESTAMP(3) NOT NULL,
CONSTRAINT "CellMapping_pkey" PRIMARY KEY ("id")
);
-- CreateTable
-- A named aggregation over K-1 cells, optionally scoped to a partnership.
CREATE TABLE "CellAggregationRule" (
"id" TEXT NOT NULL,
-- Nullable. NOTE(review): rows with NULL here are presumably global rules
-- -- confirm against the aggregation service.
"partnershipId" TEXT,
"name" TEXT NOT NULL,
-- Free-form text defaulting to 'SUM'; not constrained to a fixed set here.
"operation" TEXT NOT NULL DEFAULT 'SUM',
-- Required JSON payload. NOTE(review): presumably the list of box keys the
-- operation is applied to -- the shape is not enforced by the database.
"sourceCells" JSONB NOT NULL,
"sortOrder" INTEGER NOT NULL,
"createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
-- No database default: every insert must supply a value.
"updatedAt" TIMESTAMP(3) NOT NULL,
CONSTRAINT "CellAggregationRule_pkey" PRIMARY KEY ("id")
);
-- CreateIndex
-- Lookup of import sessions by partnership and tax year.
CREATE INDEX "K1ImportSession_partnershipId_taxYear_idx" ON "K1ImportSession"("partnershipId", "taxYear");
-- CreateIndex
CREATE INDEX "K1ImportSession_userId_idx" ON "K1ImportSession"("userId");
-- CreateIndex
-- At most one import session may reference a given KDocument. The column is
-- nullable, and any number of sessions may have a NULL "kDocumentId".
CREATE UNIQUE INDEX "K1ImportSession_kDocumentId_key" ON "K1ImportSession"("kDocumentId");
-- CreateIndex
CREATE INDEX "CellMapping_partnershipId_idx" ON "CellMapping"("partnershipId");
-- CreateIndex
-- NOTE(review): "partnershipId" is nullable, and a PostgreSQL unique index
-- treats NULLs as distinct by default. This index therefore does NOT prevent
-- duplicate "boxNumber" rows where "partnershipId" IS NULL. Confirm whether
-- those rows need their own guarantee (e.g. a partial unique index).
CREATE UNIQUE INDEX "CellMapping_partnershipId_boxNumber_key" ON "CellMapping"("partnershipId", "boxNumber");
-- CreateIndex
CREATE INDEX "CellAggregationRule_partnershipId_idx" ON "CellAggregationRule"("partnershipId");
-- CreateIndex
-- NOTE(review): same NULL caveat as the CellMapping unique index above in this
-- statement group: rule names are not forced to be unique among rows whose
-- "partnershipId" IS NULL.
CREATE UNIQUE INDEX "CellAggregationRule_partnershipId_name_key" ON "CellAggregationRule"("partnershipId", "name");
-- AddForeignKey
-- Deleting a Partnership deletes its import sessions.
ALTER TABLE "K1ImportSession" ADD CONSTRAINT "K1ImportSession_partnershipId_fkey" FOREIGN KEY ("partnershipId") REFERENCES "Partnership"("id") ON DELETE CASCADE ON UPDATE CASCADE;
-- AddForeignKey
-- Deleting a User deletes the import sessions that user owns.
ALTER TABLE "K1ImportSession" ADD CONSTRAINT "K1ImportSession_userId_fkey" FOREIGN KEY ("userId") REFERENCES "User"("id") ON DELETE CASCADE ON UPDATE CASCADE;
-- AddForeignKey
-- Deleting the Document keeps the session and clears its "documentId".
ALTER TABLE "K1ImportSession" ADD CONSTRAINT "K1ImportSession_documentId_fkey" FOREIGN KEY ("documentId") REFERENCES "Document"("id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
-- Deleting the KDocument keeps the session and clears its "kDocumentId".
ALTER TABLE "K1ImportSession" ADD CONSTRAINT "K1ImportSession_kDocumentId_fkey" FOREIGN KEY ("kDocumentId") REFERENCES "KDocument"("id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
-- Deleting a Partnership deletes its partnership-scoped cell mappings.
ALTER TABLE "CellMapping" ADD CONSTRAINT "CellMapping_partnershipId_fkey" FOREIGN KEY ("partnershipId") REFERENCES "Partnership"("id") ON DELETE CASCADE ON UPDATE CASCADE;
-- AddForeignKey
-- Deleting a Partnership deletes its partnership-scoped aggregation rules.
ALTER TABLE "CellAggregationRule" ADD CONSTRAINT "CellAggregationRule_partnershipId_fkey" FOREIGN KEY ("partnershipId") REFERENCES "Partnership"("id") ON DELETE CASCADE ON UPDATE CASCADE;

104
prisma/schema.prisma

@ -282,6 +282,7 @@ model User {
updatedAt DateTime @updatedAt
watchlist SymbolProfile[] @relation("UserWatchlist")
SymbolProfile SymbolProfile[]
k1ImportSessions K1ImportSession[]
@@index([accessToken])
@@index([createdAt])
@ -468,8 +469,11 @@ model Partnership {
assets PartnershipAsset[]
valuations PartnershipValuation[]
distributions Distribution[]
kDocuments KDocument[]
documents Document[]
kDocuments KDocument[]
documents Document[]
importSessions K1ImportSession[]
cellMappings CellMapping[]
aggregationRules CellAggregationRule[]
@@index([name])
@@index([type])
@ -537,17 +541,21 @@ model Distribution {
}
model KDocument {
id String @id @default(uuid())
partnershipId String
partnership Partnership @relation(fields: [partnershipId], onDelete: Cascade, references: [id])
type KDocumentType
taxYear Int
filingStatus KDocumentStatus @default(DRAFT)
data Json
documentFileId String?
documentFile Document? @relation(fields: [documentFileId], references: [id])
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
id String @id @default(uuid())
partnershipId String
partnership Partnership @relation(fields: [partnershipId], onDelete: Cascade, references: [id])
type KDocumentType
taxYear Int
filingStatus KDocumentStatus @default(DRAFT)
data Json
previousData Json?
previousFilingStatus KDocumentStatus?
documentFileId String?
documentFile Document? @relation(fields: [documentFileId], references: [id])
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
importSession K1ImportSession?
@@unique([partnershipId, type, taxYear])
@@index([partnershipId])
@ -620,8 +628,76 @@ model Document {
taxYear Int?
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
kDocuments KDocument[]
kDocuments KDocument[]
k1ImportSessions K1ImportSession[]
@@index([entityId])
@@index([partnershipId])
}
// Status values for a K1ImportSession. PROCESSING is the default assigned to
// new sessions (see K1ImportSession.status).
enum K1ImportStatus {
PROCESSING
EXTRACTED
VERIFIED
CONFIRMED
CANCELLED
FAILED
}
// Tracks a single K-1 import: the owning partnership and user, the tax year,
// the uploaded file's name and size, the extraction method, and the status.
model K1ImportSession {
id String @id @default(uuid())
partnershipId String
// Sessions are removed when their partnership or user is deleted.
partnership Partnership @relation(fields: [partnershipId], onDelete: Cascade, references: [id])
userId String
user User @relation(fields: [userId], onDelete: Cascade, references: [id])
status K1ImportStatus @default(PROCESSING)
taxYear Int
fileName String
fileSize Int
// Free-form string, not an enum. NOTE(review): presumably names the extractor
// that produced rawExtraction -- confirm against the import service.
extractionMethod String
// Optional JSON payloads. NOTE(review): presumably the extractor output and
// the user-verified values respectively -- confirm against the import service.
rawExtraction Json?
verifiedData Json?
// Optional links with no explicit onDelete; the accompanying migration
// creates these foreign keys with ON DELETE SET NULL.
documentId String?
document Document? @relation(fields: [documentId], references: [id])
// @unique makes this a one-to-one relation: a KDocument has at most one
// import session.
kDocumentId String? @unique
kDocument KDocument? @relation(fields: [kDocumentId], references: [id])
errorMessage String?
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
@@index([partnershipId, taxYear])
@@index([userId])
}
// Label and display metadata for a K-1 box, keyed by boxNumber and optionally
// scoped to a partnership.
model CellMapping {
id String @id @default(uuid())
// Optional scope. NOTE(review): rows with a null partnershipId are presumably
// global defaults -- confirm against the cell-mapping service.
partnershipId String?
partnership Partnership? @relation(fields: [partnershipId], onDelete: Cascade, references: [id])
boxNumber String
label String
description String?
// Free-form string defaulting to "number"; valid values are not enforced here.
cellType String @default("number")
isCustom Boolean @default(false)
isIgnored Boolean @default(false)
sortOrder Int
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
// NOTE(review): partnershipId is nullable and PostgreSQL treats NULLs as
// distinct in unique indexes, so this does not prevent duplicate boxNumber
// rows whose partnershipId is null.
@@unique([partnershipId, boxNumber])
@@index([partnershipId])
}
// A named aggregation over K-1 cells, optionally scoped to a partnership.
model CellAggregationRule {
id String @id @default(uuid())
// Optional scope. NOTE(review): rows with a null partnershipId are presumably
// global rules -- confirm against the aggregation service.
partnershipId String?
partnership Partnership? @relation(fields: [partnershipId], onDelete: Cascade, references: [id])
name String
// Free-form string defaulting to "SUM"; not constrained to a fixed set here.
operation String @default("SUM")
// Required JSON payload. NOTE(review): presumably the list of box keys the
// operation applies to -- the shape is not enforced by the schema.
sourceCells Json
sortOrder Int
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
// NOTE(review): partnershipId is nullable and PostgreSQL treats NULLs as
// distinct in unique indexes, so rule names are not forced to be unique among
// rows whose partnershipId is null.
@@unique([partnershipId, name])
@@index([partnershipId])
}

535
specs/001-family-office-transform/research-normalized-k1-model.md

@ -0,0 +1,535 @@
# Research: Normalized Relational Model for K-1 Financial Data
**Phase 0 Output** | **Date**: 2026-03-20 | **Research Only — No Code**
---
## Context
The current system stores K-1 box data as a flat JSON blob on `KDocument.data`:
```json
{"1": 50000, "9a": -1200, "11-ZZ*": 500, "20-A": 1200}
```
Aggregations are computed on-the-fly in `k1-aggregation.service.ts` by iterating JSON keys. `CellMapping` provides label metadata, and `CellAggregationRule` defines which box keys to SUM. The system currently has ~80+ possible K-1 fields (boxes 1–21 with subtypes, Sections J/K/L/M/N, metadata fields like A–I).
The goal is to evaluate whether and how to transform this into a normalized relational model.
---
## Topic 1: Wide vs Normalized Financial Data Models
### Decision
**Move to a normalized fact table** (`K1LineItem`) for Part III financial data (boxes 1–21), but **keep a JSON metadata column** for Part I/II identity fields (A–I, J–N) that are queried infrequently.
### Rationale
The current JSON blob approach has these specific weaknesses for analytics:
**Query limitations observed in this codebase:**
1. **No SQL-level filtering or aggregation** — The `computeForKDocument()` method in `k1-aggregation.service.ts` must fetch the entire `KDocument` row, deserialize JSON, and loop through `Object.entries(data)` in application code. This means you cannot write `SELECT SUM(amount) FROM ... WHERE box_number = '1' AND tax_year BETWEEN 2020 AND 2025` — every aggregation requires fetching and deserializing all rows.
2. **No practical indexes on values** — A B-tree expression index such as `((data->>'1')::numeric)` can serve range queries, but one must be created per box key, which does not scale to 80+ fields. GIN indexes support containment (`@>`), but they don't help with `>`, `<`, or `BETWEEN` on numeric values within the JSON.
3. **No referential integrity** — A typo like `"9A"` vs `"9a"` silently creates bad data. The current `CellMapping` table defines valid box numbers, but nothing enforces that `KDocument.data` keys match them.
4. **Cross-document aggregation is O(n) deserialization** — To compute "total ordinary income (Box 1) across all partnerships for 2025," every KDocument row matching the year must be fetched and parsed. With 50+ partnerships × 5 years, this is 250+ JSON deserializations for one number.
5. **No partial update tracking** — When a KDocument transitions from ESTIMATED → FINAL, the entire JSON blob is replaced. `previousData` preserves the old blob but provides no field-level diff.
6. **Schema evolution is invisible** — If the IRS adds a Box 6d in 2027, there's no migration — it just appears as a new JSON key. This sounds convenient but means no validation, no type checking, and no discoverability for future NL-to-SQL.
**When the wide/JSON model is acceptable:**
- Archival storage of the complete raw extraction (already served by `K1ImportSession.rawExtraction`)
- Rarely-queried metadata fields (Part I/II: partnership name, EIN, addresses)
- Configurations and user preferences (already used for `Settings.settings`)
- Fewer than ~10 documents with no cross-document queries needed
**When it breaks down (the current situation):**
- Cross-entity/cross-year aggregation (core family office use case)
- Performance analytics over time (partnership returns by year)
- Tax planning queries ("show me all partnerships with Section 1231 losses > $10K")
- Audit trail at field granularity
- LLM-generated SQL queries (LLMs cannot reliably generate JSONB path expressions)
### Alternatives Considered
| Alternative | Pros | Cons |
|---|---|---|
| **Keep JSON blob** (status quo) | No migration, flexible schema | All query limitations above; blocks analytics roadmap |
| **JSONB with generated columns** | No schema change for K-1 fields; PostgreSQL 12+ supports `GENERATED ALWAYS AS (data->>'1')::numeric` | Max ~30 generated columns practical; doesn't scale to 80+ fields; still no FK integrity |
| **Wide table with 80+ columns** | Simple queries, strong typing | Extremely sparse (most K-1s populate ~20 of 80+ boxes); ALTER TABLE for every IRS form change; NULL-heavy |
| **Normalized fact table** (chosen) | SQL aggregation, indexes, FK integrity, LLM-friendly, field-level audit trail | More JOINs; migration effort; slightly more complex insert logic |
---
## Topic 2: EAV vs Normalized Tables for Tax Document Fields
### Decision
**Use a hybrid approach**: a single EAV-style fact table (`K1LineItem`) for all Part III financial line items, combined with a reference/dimension table (`K1BoxDefinition`) that provides metadata, typing, and validation rules. Keep Part I/II identity metadata as structured JSON on the KDocument.
This is technically EAV but with strong constraints — it's closer to a **typed fact table** pattern than classic unconstrained EAV.
### Rationale
**Why EAV is appropriate here (and usually isn't):**
Classic EAV fails because it loses type safety, makes queries verbose, and resists validation. K-1 data avoids these pitfalls because:
1. **Uniform value type** — All Part III financial values (boxes 1–21) are `Decimal` amounts. Unlike generic EAV where attributes might be strings, dates, booleans, or blobs, K-1 line items are uniformly monetary amounts with a known currency. This eliminates the "value_string / value_number / value_date" anti-pattern.
2. **Closed attribute set** — The IRS defines ~50 Part III line items. This is not open-ended. The `K1BoxDefinition` reference table enumerates all valid attributes, so there's no unbounded attribute sprawl.
3. **Natural query pattern** — The primary queries are aggregations across one attribute dimension: `SUM(amount) WHERE box_key = '1'`. This is exactly what EAV is good at — pivot-style aggregation across a known set of attributes.
4. **Sparse data** — A typical K-1 populates 15–25 of ~50 possible line items. A wide table would be 50–70% NULL. The EAV/fact table stores only populated fields, which is both space-efficient and semantically clearer.
**Proposed structure (conceptual):**
```
K1BoxDefinition (reference/dimension table)
├── boxKey VARCHAR PK -- "1", "9a", "11-ZZ*", "20-A"
├── label VARCHAR -- "Ordinary business income (loss)"
├── section VARCHAR -- "PART_III", "PART_I", "SECTION_J"
├── dataType VARCHAR -- "CURRENCY", "PERCENTAGE", "BOOLEAN", "TEXT"
├── sortOrder INT
├── irsFormLine VARCHAR -- "Box 1", "Box 9a", "Section J, Line 1"
└── description TEXT
K1LineItem (fact table — one row per box per KDocument)
├── id UUID PK
├── kDocumentId UUID FK → KDocument.id
├── boxKey VARCHAR FK → K1BoxDefinition.boxKey
├── amount DECIMAL(15,2) -- financial value (null for non-monetary)
├── textValue VARCHAR -- for text/boolean fields if needed
├── sourceConfidence DECIMAL(3,2) -- 0.00–1.00, from extraction
├── sourcePageNumber INT -- PDF page where extracted
├── sourceCoordinates JSON -- {x, y, width, height} on the page
├── isUserEdited BOOLEAN -- true if user modified during verification
├── createdAt TIMESTAMP
├── updatedAt TIMESTAMP
└── @@unique([kDocumentId, boxKey])
```
**Why not separate normalized tables for each box category:**
An alternative is dedicated tables: `K1IncomeItems`, `K1DeductionItems`, `K1CreditItems`, `K1CapitalAccount`, etc. This was rejected because:
- K-1 boxes don't cleanly partition into fixed categories (Box 11 "Other income" spans multiple categories via sub-codes)
- Sub-code boxes (11-A through 11-ZZ*, 13-A through 13-ZZ*, 20-A through 20-ZZ*) have partnership-specific meaning — the same structural pattern repeats across boxes
- It would require 6–8 tables with identical column shapes, making queries harder, not easier
- The `K1BoxDefinition` reference table provides the categorical metadata without needing separate physical tables
**Treatment of Part I/II metadata fields:**
Fields like Partnership EIN (Box A), Partner name (Box F), Section J percentages, and Section L capital account data are better stored as structured JSON on `KDocument` in a `metadata` column because:
- They're queried for display, not for aggregation
- They have heterogeneous types (strings, booleans, percentages, addresses)
- They identify the document rather than representing financial facts
- There are ~30 of them, and they're almost all populated (not sparse)
### Alternatives Considered
| Alternative | Pros | Cons |
|---|---|---|
| **Pure EAV (no reference table)** | Maximum flexibility | No validation of box keys; `CellMapping` already serves this role but without FK enforcement |
| **Wide table (one column per box)** | Simple SELECTs for specific boxes | 80+ columns; 50–70% NULLs; ALTER TABLE for new boxes; poor for cross-box aggregation |
| **Separate tables per box category** | Strong typing per category | 6–8 near-identical tables; complex UNION queries; sub-code boxes don't fit cleanly |
| **Hybrid EAV + reference table** (chosen) | Uniform fact table; strong FK validation; sparse-friendly; single query pattern for aggregation; field-level provenance | Pivot queries needed for "show one K-1 as a form"; slightly more complex writes |
---
## Topic 3: Financial Fact Tables for Tax Data
### Decision
**Model K-1 line items as a financial fact table** in a star-schema-inspired design, with KDocument as the central bridge to dimension tables (Partnership, Entity, TaxYear). Monetary values stored as `DECIMAL(15,2)` with explicit currency.
### Rationale
Financial data warehouses consistently use a fact/dimension pattern for tax line items:
**Star schema mapping for K-1 data:**
```
┌──────────────┐
│ Partnership │ (dimension)
│ ────────── │
│ id, name, │
│ type, ein │
└──────┬───────┘
┌──────────────┐ ┌──────┴───────┐ ┌──────────────────┐
│ Entity │────│ KDocument │────│ K1BoxDefinition │ (dimension)
│ (dimension) │ │ (bridge) │ │ ────────────────│
│ ────────── │ │ ────────── │ │ boxKey, label, │
│ id, name, │ │ id, taxYear,│ │ section, type │
│ type, taxId │ │ status │ └──────────────────┘
└──────────────┘ └──────┬───────┘
┌──────┴───────┐
│ K1LineItem │ (FACT)
│ ────────── │
│ amount, │
│ boxKey, │
│ confidence │
└──────────────┘
```
**Best practices from financial data warehousing applied here:**
1. **Additive facts only** — `K1LineItem.amount` is fully additive: you can SUM across tax years, partnerships, entities, or box types. Non-additive data (percentages, booleans, text) is stored separately in `textValue` or on the KDocument metadata.
2. **Grain = one box value per K-1 document** — Each row in `K1LineItem` represents one financial amount from one K-1 for one tax year. This is the atomic grain. Aggregation rules from `CellAggregationRule` operate on this grain.
3. **Slowly changing dimensions** — `PartnershipMembership` already handles SCD Type 2 (effective dates) for ownership percentages. `K1BoxDefinition` is SCD Type 1 (overwritten on IRS form changes, with version tracking if needed).
4. **Conformed dimensions** — `Partnership` and `Entity` serve as conformed dimensions shared between K-1 facts, Distribution facts, and Valuation facts. A single `Entity` dimension joins to multiple fact tables.
5. **Currency handling** — Store amounts in the source currency with a `currency` column. The KDocument inherits currency from Partnership. Conversion to reporting currency happens at query time or in materialized views, never by mutating the fact.
6. **Decimal precision** — `DECIMAL(15,2)` covers amounts up to $9,999,999,999,999.99. K-1 amounts from large partnerships (PE funds, hedge funds) can reach tens of millions. 15 digits provides headroom. Use 2 decimal places to match IRS reporting precision.
**Aggregation queries enabled by this model:**
```sql
-- Total ordinary income across all partnerships for 2025
SELECT SUM(li.amount)
FROM k1_line_item li
JOIN k_document kd ON li.k_document_id = kd.id
WHERE li.box_key = '1' AND kd.tax_year = 2025;
-- Income breakdown by entity for tax year 2025
SELECT e.name, li.box_key, SUM(li.amount)
FROM k1_line_item li
JOIN k_document kd ON li.k_document_id = kd.id
JOIN partnership p ON kd.partnership_id = p.id
JOIN partnership_membership pm ON pm.partnership_id = p.id
JOIN entity e ON pm.entity_id = e.id
WHERE kd.tax_year = 2025
GROUP BY e.name, li.box_key;
-- Partnership performance: Box 1 over time
SELECT kd.tax_year, p.name, li.amount
FROM k1_line_item li
JOIN k_document kd ON li.k_document_id = kd.id
JOIN partnership p ON kd.partnership_id = p.id
WHERE li.box_key = '1'
ORDER BY kd.tax_year;
```
These queries are impossible or impractical with the current JSON blob model.
### Alternatives Considered
| Alternative | Pros | Cons |
|---|---|---|
| **Snowflake schema (more normalization)** | Normalized box categories into sub-dimensions | Over-normalized for ~50 box types; extra JOINs for no benefit |
| **Flat denormalized reporting table** | Fastest reads; no JOINs | Write complexity; data duplication; hard to keep consistent |
| **OLAP cube / column store** | Best aggregation performance | Overkill for <10K rows; adds infrastructure complexity |
| **Star-schema-inspired fact table** (chosen) | Natural fit for K-1 aggregation queries; leverages existing dimensions; PostgreSQL handles this scale trivially | Requires JOINs for full context (acceptable) |
---
## Topic 4: Source Traceability in Financial Systems
### Decision
**Store extraction provenance at the line-item grain** — each `K1LineItem` records the source page number, bounding-box coordinates, raw extracted text, confidence score, and whether it was user-edited. The `K1ImportSession` retains the complete raw extraction as an immutable JSON snapshot.
### Rationale
The audit trail must support this flow:
```
Displayed aggregated number
→ K1LineItem (individual box value)
→ KDocument (which K-1, which year, which partnership)
→ K1ImportSession (extraction record)
→ Document (source PDF file)
→ Specific page + coordinates on that page
→ Raw extracted text before parsing
```
**Granularity levels and what to store where:**
| Level | Table | Fields | Purpose |
|---|---|---|---|
| **Aggregation** | Computed at query time | SUM/formula from `CellAggregationRule` | "Where does this total come from?" → list of K1LineItems |
| **Line item** | `K1LineItem` | `amount`, `boxKey`, `sourceConfidence`, `sourcePageNumber`, `sourceCoordinates`, `rawExtractedText`, `isUserEdited` | "What exactly was extracted and from where?" |
| **Document** | `K1ImportSession` | `rawExtraction` (full JSON), `extractionMethod`, `fileName` | "What did the system originally see?" (immutable after extraction) |
| **File** | `Document` | `filePath`, `fileSize`, `mimeType` | "Where is the original PDF?" |
**Key design principles:**
1. **Immutability of raw extraction** — `K1ImportSession.rawExtraction` is written once at extraction time and never modified. `verifiedData` captures user edits. This provides a complete before/after audit trail.
2. **Coordinate-level provenance** — Current `k1-positions-dump.txt` shows the parser already extracts `x, y` coordinates for each text element. Storing `sourceCoordinates: {x, y, width, height}` on each `K1LineItem` enables a future "click to highlight in PDF" feature.
3. **Confidence as first-class data** — The system already computes confidence scores (0.0–1.0) during extraction. Persisting this on the line item (not just in the import session JSON) enables queries like "show me all low-confidence values across all partnerships" and supports audit prioritization.
4. **User edit tracking** — `isUserEdited: boolean` distinguishes machine-extracted values from human-verified overrides. This is critical for audit and for training future extraction models.
5. **No deletion of source data** — When a KDocument transitions from ESTIMATED → FINAL, the old line items should be soft-versioned (via `KDocument.previousData` or a separate version table), not deleted.
**What NOT to store at line-item level:**
- Full PDF binary (stay on Document/filesystem)
- Complete OCR output for the entire page (stay on K1ImportSession.rawExtraction)
- Rendering coordinates for non-K-1 text on the page (not relevant)
### Alternatives Considered
| Alternative | Pros | Cons |
|---|---|---|
| **Provenance only at document level** | Simpler; fewer columns | Cannot trace an individual number back to a specific location on a page |
| **Separate provenance table** (K1LineItemProvenance) | Clean separation of concerns | Extra JOIN for every audit query; 1:1 relationship is usually better as columns |
| **Store full page image crops per line item** | Visual proof | Massive storage; PDF coordinates + original file are sufficient for re-rendering |
| **Provenance on line item** (chosen) | Direct traceability; no extra JOINs; enables "highlight in PDF"; supports audit queries | Slightly wider rows (acceptable for <10K rows) |
---
## Topic 5: PostgreSQL Materialized Views for Financial Reporting
### Decision
**Use materialized views for cross-partnership/cross-year aggregation dashboards**, refreshed on a schedule or triggered by KDocument changes. Use regular views for single-document or single-partnership queries. Do **not** use denormalized reporting tables.
### Rationale
**When to use each approach in this system:**
| Scenario | Approach | Reason |
|---|---|---|
| "Show Box 1–21 for one K-1" | Regular query on `K1LineItem` | Small result set; no aggregation; fast enough |
| "Total income by box for one partnership across years" | Regular SQL `GROUP BY` | <20 rows × <10 years = <200 rows; trivial for PostgreSQL |
| "Dashboard: all partnerships × all entities × 5 years" | **Materialized view** | Cross-joins across dimensions; 50 partnerships × 5 entities × 5 years × 20 boxes = 25,000 aggregated values; worth pre-computing |
| "Tax planning: find partnerships with specific loss patterns" | Materialized view or indexed view | Complex filtering across many K-1s |
| "YoY change in Box 1 by partnership" | Materialized view | Window functions over multiple years |
**Proposed materialized views:**
```sql
-- MV 1: K-1 Summary by Partnership/Year
CREATE MATERIALIZED VIEW mv_k1_partnership_year_summary AS
SELECT
kd.partnership_id,
kd.tax_year,
li.box_key,
bd.label,
bd.section,
SUM(li.amount) AS total_amount,
COUNT(*) AS line_count,
kd.filing_status
FROM k1_line_item li
JOIN k_document kd ON li.k_document_id = kd.id
JOIN k1_box_definition bd ON li.box_key = bd.box_key
GROUP BY kd.partnership_id, kd.tax_year, li.box_key, bd.label, bd.section, kd.filing_status;
-- MV 2: Entity-level Income Aggregation
CREATE MATERIALIZED VIEW mv_entity_income_summary AS
SELECT
e.id AS entity_id,
e.name AS entity_name,
kd.tax_year,
li.box_key,
SUM(li.amount * pm.ownership_percent / 100) AS allocated_amount
FROM k1_line_item li
JOIN k_document kd ON li.k_document_id = kd.id
JOIN partnership_membership pm ON pm.partnership_id = kd.partnership_id
JOIN entity e ON pm.entity_id = e.id
WHERE pm.effective_date <= make_date(kd.tax_year, 12, 31)
AND (pm.end_date IS NULL OR pm.end_date > make_date(kd.tax_year, 12, 31))
GROUP BY e.id, e.name, kd.tax_year, li.box_key;
```
**Refresh strategy:**
- **Trigger-based refresh**: After any KDocument insert/update/delete or status change to FINAL, refresh affected materialized views. In NestJS, this is a `@OnEvent('k-document.changed')` handler that calls `REFRESH MATERIALIZED VIEW CONCURRENTLY`.
- **`CONCURRENTLY` keyword**: Allows reads during refresh (requires a unique index on the MV). Essential for a multi-user system.
- **Frequency**: For a family office with <100 K-1s updated per year, refresh takes <1 second. No scheduling is needed — event-driven refresh is sufficient.
**Why not denormalized reporting tables:**
Denormalized tables (duplicating data into a flat reporting structure) require write-time consistency management — every KDocument change must update the reporting table transactionally. This is the pattern used in high-write OLTP systems, but K-1 data is low-write (<100 writes/year) and high-read (dashboards queried many times). Materialized views handle this perfectly with zero application-level sync logic.
**Why not computed/generated columns:**
PostgreSQL generated columns cannot reference other tables. Since aggregations span KDocument → K1LineItem → Partnership → Entity, generated columns are structurally insufficient.
### Alternatives Considered
| Alternative | Pros | Cons |
|---|---|---|
| **Application-level caching** (Redis/in-memory) | No DB schema changes | Cache invalidation complexity; doesn't help SQL-based analytics |
| **Denormalized reporting tables** | Fastest reads; works at any scale | Write-time maintenance burden; consistency bugs; overkill for <10K rows |
| **Regular views** (not materialized) | Always fresh; no refresh needed | Recomputed on every query; slow for cross-entity dashboards |
| **Materialized views** (chosen) | Pre-computed; concurrent reads; event-driven refresh; zero application-level sync | Slight staleness (mitigated by event-driven refresh); requires unique indexes for CONCURRENTLY |
---
## Topic 6: Migration Strategy from JSON Blob to Normalized Tables
### Decision
**Phase the migration in 3 steps**: (1) Create new tables alongside existing JSON, (2) Dual-write to both during a transition period, (3) Make normalized tables authoritative. **Keep the JSON blob immutable as an archive** — never delete it.
### Rationale
**Step 1: Additive schema changes (zero breaking changes)**
```
Migration 1: Create K1BoxDefinition table, seed with IRS default box definitions
Migration 2: Create K1LineItem table with FK to KDocument and K1BoxDefinition
Migration 3: Backfill K1LineItem from existing KDocument.data JSON blobs
```
The backfill for Migration 3 (still part of Step 1):
```sql
-- Pseudocode: For each KDocument, iterate JSON keys and insert K1LineItems
INSERT INTO k1_line_item (id, k_document_id, box_key, amount, created_at, updated_at)
SELECT
gen_random_uuid(),
kd.id,
je.key,
(je.value)::decimal,
kd.created_at,
NOW()
FROM k_document kd,
jsonb_each(kd.data::jsonb) AS je(key, value)
WHERE jsonb_typeof(je.value) = 'number';
```
**Step 2: Dual-write transition period**
During the transition:
- `k1-import.service.ts` `confirmImport()` writes to **both** `KDocument.data` (JSON) and `K1LineItem` (rows)
- Read operations gradually migrate from JSON-based to K1LineItem-based
- `k1-aggregation.service.ts` switches from JSON iteration to `SELECT SUM` on K1LineItem
- Run validation queries comparing JSON-derived totals to K1LineItem-derived totals
**Step 3: K1LineItem becomes authoritative**
- New features (dashboards, tax planning, LLM queries) read only from K1LineItem
- `KDocument.data` is retained as immutable archive but no longer written to for new documents
- `CellAggregationRule.sourceCells` continues to work — the boxKey values are the same strings
- `CellMapping` evolves into or is replaced by `K1BoxDefinition`
**Should the old JSON be kept immutable?**
**Yes, permanently.** Reasons:
1. **Audit requirement** — The JSON blob is the original imported representation. Regulatory and audit standards require preserving source data in its original form.
2. **Rollback safety** — If the migration has bugs, the JSON blob is the recovery source.
3. **Storage is trivial** — A JSON blob with ~30 key-value pairs is <1 KB. Even 1,000 KDocuments = <1 MB total. There's no storage pressure to delete it.
4. **Import session already preserves extraction** — `K1ImportSession.rawExtraction` holds the pre-verification extraction. `KDocument.data` holds the post-verification snapshot. Both should survive indefinitely.
**Backward compatibility considerations:**
- The `KDocument.data` column type stays `Json` (not nullable, not removed)
- The existing `k-document-form.component.ts` UI reads from `KDocument.data` — it continues to work during transition
- The `computeForKDocument()` aggregation service works against JSON through the transition, then switches to K1LineItem queries
- No existing API contracts change — `GET /k-documents/:id` returns the same shape
**Handling the CellMapping → K1BoxDefinition transition:**
The existing `CellMapping` table (per-partnership box definitions) maps closely to the proposed `K1BoxDefinition`. The migration strategy:
- `K1BoxDefinition` absorbs the global (partnershipId = null) CellMapping records
- Per-partnership CellMapping overrides become per-partnership `K1BoxDefinition` rows (or remain as display-layer configuration separate from the data model)
- `CellMapping` fields like `isIgnored`, `isCustom` are presentation concerns that may not belong on the data-layer `K1BoxDefinition`
### Alternatives Considered
| Alternative | Pros | Cons |
|---|---|---|
| **Big-bang migration** (drop JSON, create tables, migrate in one step) | Clean; no dual-write complexity | Risk of data loss; requires full feature freeze; hard to validate |
| **Dual-write indefinitely** | Maximum safety | Permanent write overhead; divergence risk between JSON and rows |
| **Keep JSON as authoritative, add views** | No migration of writes | Doesn't solve the core query limitation; views over JSONB are slow |
| **Phased migration with immutable archive** (chosen) | Zero-downtime; incremental validation; rollback possible; preserves audit trail | Dual-write period adds complexity (bounded to weeks, not permanent) |
---
## Topic 7: Schema Design for Future LLM NL-to-SQL
### Decision
**Design tables with self-documenting names, add PostgreSQL `COMMENT ON` annotations for every table and column, use consistent naming conventions, and avoid ambiguity between similarly-named entities.**
### Rationale
LLMs generating SQL (via text-to-SQL or NL-to-SQL) work by receiving the schema as context and mapping natural language to table/column references. The schema itself is the prompt. Research from the Spider benchmark (Yale), BIRD benchmark, and production NL-to-SQL systems (e.g., Vanna.ai, DataHerald) identifies these factors as most impactful:
**1. Naming conventions that LLMs parse correctly:**
| Current Name | Problem | Proposed Name | Why Better |
|---|---|---|---|
| `KDocument` | "K" is ambiguous to LLMs | `k1_document` | Explicitly says "K-1" |
| `KDocument.data` | "data" is the most generic possible name | `k1_document.raw_data_json` | Describes what it holds |
| `K1LineItem.amount` | Could be confused with Distribution.amount | `k1_line_item.reported_amount` | Disambiguates |
| `CellMapping` | "Cell" is a spreadsheet term, not a tax term | `k1_box_definition` | Domain-specific |
| `CellAggregationRule` | LLMs may not connect "cell" to K-1 boxes | `k1_aggregation_rule` | Clearer context |
**Naming conventions to adopt:**
- `snake_case` for all table and column names (PostgreSQL convention; LLMs trained on more snake_case SQL than camelCase)
- Prefix K-1-specific tables with `k1_` to create a namespace
- Use `_id` suffix for all foreign keys
- Avoid abbreviations (`partnership_id` not `ptnr_id`)
- Use `_at` suffix for timestamps (`created_at`, `updated_at`)
- Use descriptive names over short names (`tax_year` not `yr`, `filing_status` not `status`)
**2. PostgreSQL COMMENT annotations:**
```sql
COMMENT ON TABLE k1_line_item IS 'Individual financial line item from an IRS Schedule K-1 (Form 1065). One row per box number per K-1 document.';
COMMENT ON COLUMN k1_line_item.box_key IS 'IRS K-1 box identifier such as "1" for ordinary income, "9a" for long-term capital gains, or "20-A" for other information code A.';
COMMENT ON COLUMN k1_line_item.reported_amount IS 'Dollar amount reported on this K-1 line item, in the partnership base currency. Negative values represent losses.';
COMMENT ON TABLE k1_box_definition IS 'Reference table of IRS Schedule K-1 box definitions. Maps box identifiers to human-readable labels and categories.';
```
LLM NL-to-SQL systems extract these comments as schema context. A model asked "what is total ordinary income?" can map "ordinary income" → `k1_box_definition.label = 'Ordinary business income (loss)'` → `box_key = '1'` → join to `k1_line_item`.
**3. Avoiding ambiguity:**
Current pain points for LLM-generated SQL:
- `Distribution.amount` vs `K1LineItem.amount` — an LLM asked "total distributions" might query the wrong table. Solution: `k1_line_item.reported_amount` vs `distribution.distribution_amount`.
- `Partnership` has `distributions`, `kDocuments`, `valuations` — naming all FK columns `partnership_id` is correct and expected by LLMs.
- `Entity` is overloaded (database entities, legal entities). The table comment must clarify: "A legal person or structure (trust, LLC, individual) that owns assets and receives K-1 allocations."
**4. Schema metadata table for LLM context:**
Consider a lightweight `schema_metadata` table or a markdown document that provides the LLM with:
- Table relationships in natural language
- Common query patterns with examples
- Business rules ("Box 19a distributions are allocated to entities by ownership percentage")
- Valid values for enum columns
This is cheaper than fine-tuning and more maintainable than few-shot prompts.
**5. Avoid patterns that confuse LLMs:**
| Anti-pattern | Why It Confuses LLMs | Alternative |
|---|---|---|
| JSON columns for queryable data | LLMs generate `->` / `->>` operators inconsistently | Normalized columns |
| Composite primary keys | LLMs often forget one part of the key in JOINs | Surrogate UUID PK + unique constraint |
| Polymorphic FKs (one FK, multiple target tables) | LLMs can't determine which table to JOIN | Separate FK columns |
| Generic column names (`type`, `status`, `data`, `value`) | Ambiguous across tables | Prefix with table context (`filing_status`, `box_data_type`) |
| Soft deletes (`is_deleted`) | LLMs forget the `WHERE is_deleted = false` filter | Use `end_date IS NULL` pattern (already in use for memberships) |
### Alternatives Considered
| Alternative | Pros | Cons |
|---|---|---|
| **No schema changes for LLM** | No work | LLM accuracy drops significantly with ambiguous/generic names; JSONB columns are nearly unusable for NL-to-SQL |
| **Fine-tune LLM on this schema** | Can handle any naming convention | Expensive; needs retraining on every schema change; vendor lock-in |
| **RAG over schema docs** | Flexible; schema-aware | Still limited by underlying schema quality; garbage-in-garbage-out |
| **Self-documenting schema + COMMENT annotations** (chosen) | Works with any LLM; zero runtime cost; maintainable; improves human readability too | Requires discipline to maintain comments on schema changes |
---
## Summary of Decisions
| # | Topic | Decision |
|---|---|---|
| 1 | Wide vs Normalized | Normalized fact table for Part III financial data; JSON retained for Part I/II metadata |
| 2 | EAV vs Normalized | Hybrid: typed EAV fact table (`K1LineItem`) with reference dimension (`K1BoxDefinition`); uniform `DECIMAL` value type avoids classic EAV pitfalls |
| 3 | Financial fact tables | Star-schema-inspired design with `K1LineItem` as fact, `KDocument`/`Partnership`/`Entity` as dimensions |
| 4 | Source traceability | Per-line-item provenance (page, coordinates, confidence, raw text, user-edit flag); K1ImportSession.rawExtraction as immutable full extraction archive |
| 5 | Materialized views | Event-driven materialized views for cross-entity dashboards; regular queries for single-document access |
| 6 | Migration strategy | 3-phase: additive tables → dual-write → K1LineItem authoritative; JSON blob kept immutable forever |
| 7 | LLM NL-to-SQL | Self-documenting `snake_case` names, `COMMENT ON` annotations, disambiguation of similar columns, `k1_` table prefix namespace |

37
specs/004-k1-scan-import/checklists/requirements.md

@ -0,0 +1,37 @@
# Specification Quality Checklist: Automated K-1 PDF Scanning & Model Object Creation
**Purpose**: Validate specification completeness and quality before proceeding to planning
**Created**: 2026-03-18
**Feature**: [spec.md](../spec.md)
## Content Quality
- [x] No implementation details (languages, frameworks, APIs)
- [x] Focused on user value and business needs
- [x] Written for non-technical stakeholders
- [x] All mandatory sections completed
## Requirement Completeness
- [x] No [NEEDS CLARIFICATION] markers remain
- [x] Requirements are testable and unambiguous
- [x] Success criteria are measurable
- [x] Success criteria are technology-agnostic (no implementation details)
- [x] All acceptance scenarios are defined
- [x] Edge cases are identified
- [x] Scope is clearly bounded
- [x] Dependencies and assumptions identified
## Feature Readiness
- [x] All functional requirements have clear acceptance criteria
- [x] User scenarios cover primary flows
- [x] Feature meets measurable outcomes defined in Success Criteria
- [x] No implementation details leak into specification
## Notes
- Spec depends on 001-family-office-transform models (Entity, Partnership, PartnershipMembership, KDocument, Distribution, Document) being implemented first
- V1 scoped to IRS Schedule K-1 Form 1065 only (not Form 1041 or 1120-S)
- OCR/document intelligence provider is intentionally left as an implementation detail
- All [NEEDS CLARIFICATION] items were resolved with reasonable defaults documented in the Assumptions section

525
specs/004-k1-scan-import/contracts/k1-import-api.md

@ -0,0 +1,525 @@
# API Contracts: K-1 Import
**Phase 1 Output** | **Date**: 2026-03-18 | **Updated**: 2026-03-18 (post-clarification)
## Base Path
All endpoints under `/api/v1/k1-import/`
## Authentication
All endpoints require JWT authentication (`AuthGuard('jwt')`) and appropriate permissions via `HasPermissionGuard`.
---
## Endpoints
### POST /api/v1/k1-import/upload
Upload a K-1 PDF and initiate extraction.
**Permission**: `createKDocument`
**Request**: `multipart/form-data`
| Field | Type | Required | Description |
| --------------- | -------- | -------- | ------------------------------------ |
| `file` | File | Yes | PDF file (max 25 MB, MIME: application/pdf) |
| `partnershipId` | `string` | Yes | Target partnership UUID |
| `taxYear` | `number` | Yes | Tax year for this K-1 |
**Response**: `201 Created`
```json
{
"id": "uuid",
"partnershipId": "uuid",
"status": "PROCESSING",
"taxYear": 2025,
"fileName": "K1-Smith-Capital-2025.pdf",
"fileSize": 245760,
"extractionMethod": "pdf-parse",
"createdAt": "2026-03-18T00:00:00.000Z"
}
```
**Errors**:
| Status | Condition |
| ------ | -------------------------------------- |
| 400 | File is not a valid PDF |
| 400 | File exceeds 25 MB size limit |
| 400 | Partnership not found or not owned by user |
| 400 | Partnership has no active members |
| 400 | Tax year < partnership inception year |
---
### GET /api/v1/k1-import/:id
Get the current state of an import session, including extraction results.
**Permission**: `readKDocument`
**Response**: `200 OK`
```json
{
"id": "uuid",
"partnershipId": "uuid",
"status": "EXTRACTED",
"taxYear": 2025,
"fileName": "K1-Smith-Capital-2025.pdf",
"fileSize": 245760,
"extractionMethod": "pdf-parse",
"rawExtraction": {
"metadata": {
"partnershipName": "Smith Capital Partners LP",
"partnershipEin": "12-3456789",
"partnerName": "Smith Family Trust",
"partnerEin": "98-7654321",
"taxYear": 2025,
"isAmended": false,
"isFinal": true
},
"fields": [
{
"boxNumber": "1",
"label": "Ordinary business income (loss)",
"customLabel": null,
"rawValue": "$52,340",
"numericValue": 52340,
"confidence": 0.95,
"confidenceLevel": "HIGH",
"isUserEdited": false
}
],
"overallConfidence": 0.92,
"method": "pdf-parse",
"pagesProcessed": 2
},
"verifiedData": null,
"documentId": "uuid",
"kDocumentId": null,
"errorMessage": null,
"createdAt": "2026-03-18T00:00:00.000Z",
"updatedAt": "2026-03-18T00:00:05.000Z"
}
```
**Errors**:
| Status | Condition |
| ------ | ----------------------------------- |
| 404 | Import session not found |
| 403 | Import session belongs to different user |
---
### PUT /api/v1/k1-import/:id/verify
Submit user-verified/edited extraction data. Transitions status from EXTRACTED to VERIFIED.
**Permission**: `updateKDocument`
**Request**: `application/json`
```json
{
"taxYear": 2025,
"fields": [
{
"boxNumber": "1",
"label": "Ordinary business income (loss)",
"customLabel": null,
"rawValue": "$52,340",
"numericValue": 52340,
"confidence": 0.95,
"confidenceLevel": "HIGH",
"isUserEdited": false,
"isReviewed": true
},
{
"boxNumber": "11",
"label": "Other income (loss)",
"customLabel": "Section 1256 contracts",
"rawValue": "$8,200",
"numericValue": 8200,
"confidence": 0.72,
"confidenceLevel": "MEDIUM",
"isUserEdited": true,
"isReviewed": true
}
],
"unmappedItems": [
{
"rawLabel": "State tax adjustment",
"rawValue": "$1,200",
"numericValue": 1200,
"confidence": 0.65,
"pageNumber": 3,
"resolution": "discarded",
"assignedBoxNumber": null
}
]
}
```
**Response**: `200 OK` — Updated import session with status `VERIFIED`
**Errors**:
| Status | Condition |
| ------ | ----------------------------------------------- |
| 400 | Import session is not in EXTRACTED status |
| 400 | Fields array is empty |
| 400 | Medium/low-confidence fields not all reviewed (isReviewed must be true) |
| 400 | Unmapped items not all resolved (each must be 'assigned' or 'discarded') |
| 404 | Import session not found |
---
### POST /api/v1/k1-import/:id/confirm
Confirm verified data and trigger automatic model object creation (KDocument, Distributions, Document linkage).
**Permission**: `createKDocument`
**Request**: `application/json`
```json
{
"filingStatus": "DRAFT",
"existingKDocumentAction": null
}
```
| Field | Type | Required | Description |
| ------------------------- | ----------------------------- | -------- | ---------------------------------------- |
| `filingStatus` | `"DRAFT" \| "ESTIMATED" \| "FINAL"` | Yes | Status for the created/updated KDocument |
| `existingKDocumentAction` | `"UPDATE" \| "CREATE_NEW" \| null` | No | Action if KDocument already exists |
**Response**: `201 Created`
```json
{
"importSession": {
"id": "uuid",
"status": "CONFIRMED"
},
"kDocument": {
"id": "uuid",
"partnershipId": "uuid",
"type": "K1",
"taxYear": 2025,
"filingStatus": "DRAFT",
"data": { "ordinaryIncome": 52340, "..." : "..." }
},
"distributions": [
{
"id": "uuid",
"entityId": "uuid",
"partnershipId": "uuid",
"type": "RETURN_OF_CAPITAL",
"amount": 60000,
"date": "2025-12-31T00:00:00.000Z"
}
],
"allocations": [
{
"entityId": "uuid",
"entityName": "Smith Family Trust",
"ownershipPercent": 60,
"allocatedValues": { "ordinaryIncome": 31404, "..." : "..." }
}
],
"document": {
"id": "uuid",
"type": "K1",
"name": "K1-Smith-Capital-2025.pdf"
}
}
```
**Errors**:
| Status | Condition |
| ------ | ------------------------------------------------------------- |
| 400 | Import session is not in VERIFIED status |
| 400 | Partnership has no active members |
| 409 | KDocument already exists for this partnership/year and no action specified |
---
### POST /api/v1/k1-import/:id/cancel
Cancel an import session. No model objects are created.
**Permission**: `updateKDocument`
**Response**: `200 OK` — Updated import session with status `CANCELLED`
**Errors**:
| Status | Condition |
| ------ | --------------------------------------------- |
| 400 | Import session is already CONFIRMED or CANCELLED |
| 404 | Import session not found |
---
### GET /api/v1/k1-import/history
List import sessions for a partnership, ordered by creation date descending.
**Permission**: `readKDocument`
**Query Parameters**:
| Param | Type | Required | Description |
| --------------- | -------- | -------- | ------------------------------ |
| `partnershipId` | `string` | Yes | Partnership UUID |
| `taxYear` | `number` | No | Filter by tax year |
**Response**: `200 OK` — Array of import session summaries
```json
[
{
"id": "uuid",
"partnershipId": "uuid",
"status": "CONFIRMED",
"taxYear": 2025,
"fileName": "K1-Smith-Capital-2025.pdf",
"extractionMethod": "pdf-parse",
"kDocumentId": "uuid",
"createdAt": "2026-03-18T00:00:00.000Z"
}
]
```
---
### POST /api/v1/k1-import/:id/reprocess
Re-run extraction on a previously uploaded PDF using the current cell mapping configuration.
**Permission**: `updateKDocument`
**Response**: `200 OK` — New import session with status `PROCESSING` (original session unchanged)
**Errors**:
| Status | Condition |
| ------ | ------------------------------------------- |
| 400 | Original import session has no stored document |
| 404 | Import session not found |
---
## Cell Mapping Endpoints
### GET /api/v1/cell-mapping
Get cell mappings for a partnership (with global defaults for unmapped boxes).
**Permission**: `readKDocument`
**Query Parameters**:
| Param | Type | Required | Description |
| --------------- | -------- | -------- | ---------------------------------------- |
| `partnershipId` | `string` | No | Partnership UUID (omit for global defaults) |
**Response**: `200 OK`
```json
[
{
"id": "uuid",
"partnershipId": null,
"boxNumber": "1",
"label": "Ordinary business income (loss)",
"description": "IRS Schedule K-1 Box 1",
"isCustom": false,
"sortOrder": 1
}
]
```
---
### PUT /api/v1/cell-mapping
Update or create cell mappings for a partnership.
**Permission**: `updateKDocument`
**Request**: `application/json`
```json
{
"partnershipId": "uuid",
"mappings": [
{
"boxNumber": "11",
"label": "Section 1256 contracts",
"description": "Custom label for Box 11",
"isCustom": false
},
{
"boxNumber": "20-Z",
"label": "Qualified Business Income (Section 199A)",
"description": "Custom additional box",
"isCustom": true
}
]
}
```
**Response**: `200 OK` — Updated mappings array
---
### DELETE /api/v1/cell-mapping/reset
Reset a partnership's cell mappings to IRS defaults (deletes all custom mappings for the partnership).
**Permission**: `updateKDocument`
**Query Parameters**:
| Param | Type | Required | Description |
| --------------- | -------- | -------- | ------------------ |
| `partnershipId` | `string` | Yes | Partnership UUID |
**Response**: `200 OK`
---
## Aggregation Rule Endpoints
### GET /api/v1/cell-mapping/aggregation-rules
Get aggregation rules for a partnership (with global defaults for partnerships without custom rules).
**Permission**: `readKDocument`
**Query Parameters**:
| Param | Type | Required | Description |
| --------------- | -------- | -------- | ---------------------------------------------- |
| `partnershipId` | `string` | No | Partnership UUID (omit for global defaults) |
**Response**: `200 OK`
```json
[
{
"id": "uuid",
"partnershipId": null,
"name": "Total Ordinary Income",
"operation": "SUM",
"sourceCells": ["1"],
"sortOrder": 1
},
{
"id": "uuid",
"partnershipId": null,
"name": "Total Capital Gains",
"operation": "SUM",
"sourceCells": ["8", "9a", "9b", "9c", "10"],
"sortOrder": 2
},
{
"id": "uuid",
"partnershipId": null,
"name": "Total Deductions",
"operation": "SUM",
"sourceCells": ["12", "13"],
"sortOrder": 3
}
]
```
---
### PUT /api/v1/cell-mapping/aggregation-rules
Create or update aggregation rules for a partnership.
**Permission**: `updateKDocument`
**Request**: `application/json`
```json
{
"partnershipId": "uuid",
"rules": [
{
"name": "Income Summary",
"operation": "SUM",
"sourceCells": ["1", "2", "3", "4b", "5", "6a", "7"]
},
{
"name": "Total Capital Gains",
"operation": "SUM",
"sourceCells": ["8", "9a", "10"]
}
]
}
```
**Response**: `200 OK` — Updated rules array
**Errors**:
| Status | Condition |
| ------ | ---------------------------------------------------- |
| 400 | Source cell box number not found in cell mappings |
| 400 | Duplicate rule name for the same partnership |
| 400 | Empty sourceCells array |
---
### GET /api/v1/cell-mapping/aggregation-rules/compute
Compute aggregation values for a specific KDocument's data. Returns the dynamically calculated totals.
**Permission**: `readKDocument`
**Query Parameters**:
| Param | Type | Required | Description |
| --------------- | -------- | -------- | ---------------------------------------------- |
| `kDocumentId` | `string` | Yes | KDocument UUID to compute aggregates for |
| `partnershipId` | `string` | No | Override which partnership's rules to use |
**Response**: `200 OK`
```json
[
{
"ruleId": "uuid",
"name": "Income Summary",
"operation": "SUM",
"sourceCells": ["1", "2", "3", "4b", "5", "6a", "7"],
"computedValue": 187520.00,
"breakdown": {
"1": 52340,
"2": 35000,
"3": 0,
"4b": 15000,
"5": 8200,
"6a": 72980,
"7": 4000
}
}
]
```
**Errors**:
| Status | Condition |
| ------ | --------------------------- |
| 404 | KDocument not found |

300
specs/004-k1-scan-import/data-model.md

@ -0,0 +1,300 @@
# Data Model: K-1 PDF Scan Import
**Phase 1 Output** | **Date**: 2026-03-18 | **Updated**: 2026-03-18 (post-clarification)
## Overview
This feature adds 3 new Prisma models and 1 new enum to support K-1 PDF scanning, import session tracking, cell mapping configuration, and aggregation rules. It extends the existing models from spec 001-family-office-transform (KDocument, Distribution, Document, PartnershipMembership) with automatic creation from scanned data.
### Entity Relationship Diagram (Conceptual)
```
User (existing)
└── Partnership (existing from 001)
├── K1ImportSession[] ──┬── Document (uploaded PDF, existing from 001)
│ (new model) ├── KDocument (auto-created, existing from 001)
│ └── CellMapping (per-partnership config)
├── PartnershipMembership[] (existing from 001)
│ └── [K-1 allocations computed at confirm time]
├── KDocument[] (existing from 001)
│ └── Distribution[] (auto-created from Box 19, existing from 001)
├── CellMapping[] (new model, per-partnership overrides)
└── CellAggregationRule[] (new model, per-partnership or global)
└── [computed totals derived dynamically from raw box values]
Global CellMapping (partnershipId = null) ── IRS default box definitions
Global CellAggregationRule (partnershipId = null) ── default summary rules
```
## New Enum
### K1ImportStatus
Tracks the lifecycle of a K-1 import session.
| Value | Description |
| ------------ | -------------------------------------------------------------- |
| `PROCESSING` | PDF uploaded, extraction in progress |
| `EXTRACTED` | Extraction complete, awaiting user review |
| `VERIFIED` | User has reviewed/edited values, ready for confirmation |
| `CONFIRMED` | User confirmed, model objects created (KDocument, Distributions) |
| `CANCELLED` | User cancelled, no model objects created |
| `FAILED` | Extraction failed (invalid PDF, OCR error, etc.) |
## New Models
### K1ImportSession
A record of a single K-1 PDF import attempt, tracking the full lifecycle from upload through confirmation.
| Field | Type | Constraints | Description |
| ------------------ | ---------------- | ---------------------------- | --------------------------------------------------------------- |
| `id` | `String` | PK, UUID, auto-generated | Unique identifier |
| `partnershipId` | `String` | FK → Partnership.id, indexed | Target partnership for this K-1 import |
| `userId` | `String` | FK → User.id, indexed | User who initiated the import |
| `status` | `K1ImportStatus` | Required, Default: PROCESSING | Current lifecycle status |
| `taxYear` | `Int` | Required | Tax year extracted or specified by user |
| `fileName` | `String` | Required | Original filename of uploaded PDF |
| `fileSize` | `Int` | Required | File size in bytes |
| `extractionMethod` | `String` | Required | Method used: "pdf-parse", "azure", "tesseract" |
| `rawExtraction` | `Json?` | Optional | Raw extraction results before user edits |
| `verifiedData` | `Json?` | Optional | User-verified/edited extraction results (K1ExtractionResult) |
| `documentId` | `String?` | FK → Document.id, optional | Linked uploaded PDF Document record |
| `kDocumentId` | `String?` | FK → KDocument.id, optional | Resulting KDocument (set on CONFIRMED status) |
| `errorMessage` | `String?` | Optional | Error details if status is FAILED |
| `createdAt` | `DateTime` | Default: now() | Upload timestamp |
| `updatedAt` | `DateTime` | Auto-updated | Last modification timestamp |
**Relations**:
- `partnership` → `Partnership` (many-to-one, cascade delete)
- `user` → `User` (many-to-one, cascade delete)
- `document` → `Document?` (many-to-one, optional)
- `kDocument` → `KDocument?` (many-to-one, optional)
**Indexes**: `@@index([partnershipId, taxYear])` for import history queries per partnership/year.
### CellMapping
A configuration defining how K-1 box numbers map to labels. Supports a global IRS-default set (partnershipId = null) and per-partnership customizations.
| Field | Type | Constraints | Description |
| --------------- | ---------- | -------------------------------------- | ---------------------------------------------------- |
| `id` | `String` | PK, UUID, auto-generated | Unique identifier |
| `partnershipId` | `String?` | FK → Partnership.id, optional, indexed | Partnership this mapping applies to (null = global) |
| `boxNumber` | `String` | Required | K-1 box identifier (e.g., "1", "6a", "19a", "20-A") |
| `label` | `String` | Required | Display label (e.g., "Ordinary business income") |
| `description` | `String?` | Optional | Extended description or IRS instructions |
| `isCustom` | `Boolean` | Default: false | Whether this is a user-added custom cell |
| `sortOrder` | `Int` | Required | Display order in the verification screen |
| `createdAt` | `DateTime` | Default: now() | Creation timestamp |
| `updatedAt` | `DateTime` | Auto-updated | Last modification timestamp |
**Relations**:
- `partnership` → `Partnership?` (many-to-one, optional, cascade delete)
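**Unique constraint**: `@@unique([partnershipId, boxNumber])` — one mapping per box per partnership. Note that PostgreSQL treats `NULL` values as distinct in unique constraints by default, so this constraint alone does not prevent duplicate global rows (partnershipId = null) for the same box; global uniqueness needs a partial unique index over the rows where partnershipId is null, or an application-level check.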
### CellAggregationRule
A named rule that combines multiple K-1 cells into a computed summary value. Computed totals are NOT stored — they are derived dynamically from raw box values each time they are displayed (FR-039).
| Field | Type | Constraints | Description |
| --------------- | ---------- | -------------------------------------- | --------------------------------------------------------------- |
| `id` | `String` | PK, UUID, auto-generated | Unique identifier |
| `partnershipId` | `String?` | FK → Partnership.id, optional, indexed | Partnership this rule applies to (null = global default) |
| `name` | `String` | Required | Display name (e.g., "Income Summary", "Total Capital Gains") |
| `operation` | `String` | Required, Default: "SUM" | Aggregation operation (SUM for V1; future: AVG, MIN, MAX) |
| `sourceCells` | `Json` | Required | Array of box numbers to aggregate (e.g., ["1", "2", "3"]) |
| `sortOrder` | `Int` | Required | Display order in the aggregation summary section |
| `createdAt` | `DateTime` | Default: now() | Creation timestamp |
| `updatedAt` | `DateTime` | Auto-updated | Last modification timestamp |
**Relations**:
- `partnership` → `Partnership?` (many-to-one, optional, cascade delete)
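**Unique constraint**: `@@unique([partnershipId, name])` — one rule per name per partnership. Note that PostgreSQL treats `NULL` values as distinct in unique constraints by default, so this constraint alone does not prevent duplicate global rules (partnershipId = null) with the same name; global uniqueness needs a partial unique index over the rows where partnershipId is null, or an application-level check.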
**Note**: No `computedValue` column. Totals are always computed on-the-fly from the KDocument's raw box values using the `sourceCells` array and `operation`. This ensures summaries auto-update when underlying values change (e.g., estimated→final K-1 transition).
## Modifications to Existing Models
### Partnership (from spec 001)
Add back-references — no column changes:
| New Field | Type | Description |
| -------------------- | ------------------------ | ------------------------------------ |
| `importSessions` | `K1ImportSession[]` | Import attempts for this partnership |
| `cellMappings` | `CellMapping[]` | Custom cell mapping configurations |
| `aggregationRules` | `CellAggregationRule[]` | Custom aggregation rule definitions |
### KDocument (from spec 001)
Add back-reference — no column changes:
| New Field | Type | Description |
| ---------------- | ------------------- | ---------------------------------------- |
| `importSession` | `K1ImportSession?` | Import session that created this record |
## Application-Layer Types
### K1ExtractionResult (TypeScript interface)
The structure returned by the extraction service and stored in `K1ImportSession.rawExtraction` and `K1ImportSession.verifiedData`.
```typescript
interface K1ExtractionResult {
/** Extracted metadata from the K-1 header */
metadata: {
partnershipName: string | null;
partnershipEin: string | null;
partnerName: string | null;
partnerEin: string | null;
taxYear: number | null;
isAmended: boolean;
isFinal: boolean;
};
/** Extracted box values — mapped to known cells */
fields: K1ExtractedField[];
/** Extracted values that didn't match any configured cell mapping */
unmappedItems: K1UnmappedItem[];
/** Overall extraction confidence (0.0–1.0) */
overallConfidence: number;
/** Extraction method used */
method: 'pdf-parse' | 'azure' | 'tesseract';
/** Number of pages processed */
pagesProcessed: number;
}
interface K1ExtractedField {
/** Box identifier (e.g., "1", "6a", "19a") */
boxNumber: string;
/** Display label from cell mapping */
label: string;
/** Custom label override by user (null if not overridden) */
customLabel: string | null;
/** Extracted raw text value */
rawValue: string;
/** Parsed numeric value (null if unparseable) */
numericValue: number | null;
/** Confidence score (0.0–1.0) */
confidence: number;
/** Confidence level for display */
confidenceLevel: 'HIGH' | 'MEDIUM' | 'LOW';
/** Whether user has manually edited this value */
isUserEdited: boolean;
/** Whether user has explicitly reviewed this field (required for medium/low confidence) */
isReviewed: boolean;
}
interface K1UnmappedItem {
/** Raw text label extracted from the PDF */
rawLabel: string;
/** Raw text value extracted */
rawValue: string;
/** Parsed numeric value (null if unparseable) */
numericValue: number | null;
/** Confidence score (0.0–1.0) */
confidence: number;
/** Page number where this was extracted */
pageNumber: number;
/** User action: 'assigned' (to a cell), 'discarded', or null (pending) */
resolution: 'assigned' | 'discarded' | null;
/** If assigned, the box number it was assigned to */
assignedBoxNumber: string | null;
}
```
### K1ConfirmationRequest (TypeScript interface)
The request body when the user confirms verified K-1 data.
```typescript
interface K1ConfirmationRequest {
/** Import session ID */
importSessionId: string;
/** Tax year (may have been overridden by user) */
taxYear: number;
/** Filing status for the new KDocument */
filingStatus: 'DRAFT' | 'ESTIMATED' | 'FINAL';
/** Verified fields with any user edits applied */
fields: K1ExtractedField[];
/** Whether to update an existing KDocument (null = create new) */
existingKDocumentAction: 'UPDATE' | 'CREATE_NEW' | null;
}
```
### Default IRS K-1 Cell Mapping
The standard box definitions seeded as global CellMapping records (partnershipId = null):
| boxNumber | label | sortOrder |
| --------- | ----------------------------------------- | --------- |
| 1 | Ordinary business income (loss) | 1 |
| 2 | Net rental real estate income (loss) | 2 |
| 3 | Other net rental income (loss) | 3 |
| 4a | Guaranteed payments for services | 4 |
| 4b | Guaranteed payments for capital | 5 |
| 4c | Total guaranteed payments | 6 |
| 5 | Interest income | 7 |
| 6a | Ordinary dividends | 8 |
| 6b | Qualified dividends | 9 |
| 6c | Dividend equivalents | 10 |
| 7 | Royalties | 11 |
| 8 | Net short-term capital gain (loss) | 12 |
| 9a | Net long-term capital gain (loss) | 13 |
| 9b | Collectibles (28%) gain (loss) | 14 |
| 9c | Unrecaptured section 1250 gain | 15 |
| 10 | Net section 1231 gain (loss) | 16 |
| 11 | Other income (loss) | 17 |
| 12 | Section 179 deduction | 18 |
| 13 | Other deductions | 19 |
| 14 | Self-employment earnings (loss) | 20 |
| 15 | Credits | 21 |
| 16 | Foreign transactions | 22 |
| 17 | Alternative minimum tax (AMT) items | 23 |
| 18 | Tax-exempt income and nondeductible expenses | 24 |
| 19a | Distributions — Cash and marketable securities | 25 |
| 19b | Distributions — Other property | 26 |
| 20 | Other information | 27 |
| 21 | Foreign taxes paid or accrued | 28 |
## Validation Rules
1. **Import session partnership**: Must reference an existing partnership owned by the current user.
2. **Import session tax year**: Must be ≥ year of the partnership's inception date.
3. **File upload**: Must be a valid PDF, ≤ 25 MB. System rejects non-PDF MIME types.
4. **Extraction status transitions**: Only valid transitions: PROCESSING → EXTRACTED → VERIFIED → CONFIRMED/CANCELLED, or PROCESSING → FAILED. No backwards transitions.
5. **Cell mapping uniqueness**: One mapping per (partnershipId, boxNumber). Custom mappings for a partnership override the global default for that box number.
6. **Confirmation prerequisites**: Can only confirm when status is VERIFIED, partnership has at least one active member, and verifiedData is not null.
7. **Duplicate KDocument check**: Before creating a KDocument, check for existing (partnershipId, type=K1, taxYear). If found, require explicit user decision (update existing or reject).
8. **Distribution allocation**: Box 19a/19b amounts are allocated to members by ownership percentage as of the tax year's fiscal year end. Allocation amounts must sum exactly to the partnership-level total (handle rounding by adjusting the largest member's allocation).
9. **Aggregation rule source cells**: All box numbers in `sourceCells` must reference valid cell mapping entries. If a source cell has no value in the KDocument, it contributes 0 to the aggregate.
10. **Unmapped items resolution**: All unmapped items must be resolved (assigned to a cell or discarded) before the import session can transition to VERIFIED status.
11. **Review requirement**: All medium and low-confidence fields must have `isReviewed: true` before confirmation is allowed (FR-035). High-confidence fields are auto-set to `isReviewed: true`.

124
specs/004-k1-scan-import/plan.md

@ -0,0 +1,124 @@
# Implementation Plan: K-1 PDF Scan Import
**Branch**: `004-k1-scan-import` | **Date**: 2026-03-18 | **Spec**: [spec.md](spec.md)
**Input**: Feature specification from `/specs/004-k1-scan-import/spec.md`
## Summary
Automated K-1 PDF scanning that extracts structured IRS Schedule K-1 (Form 1065) data from uploaded PDFs, presents a verification screen with auto-accepted high-confidence values and explicit review for medium/low-confidence fields, and auto-creates downstream model objects (KDocument, Distributions, member allocations, Document). Uses a two-tier extraction approach: `pdf-parse` for digital PDFs (free, instant, local) and Azure AI Document Intelligence / `tesseract.js` fallback for scanned PDFs. Supports per-partnership cell mapping customization, administrator-defined aggregation rules (dynamically computed summaries displayed on verification screen and KDocument detail view), an "Unmapped Items" section for unrecognized extractions, and import history with re-processing.
## Technical Context
**Language/Version**: TypeScript 5.9.2, Node.js ≥ 22.18.0
**Primary Dependencies**: NestJS 11.x (backend), Angular 21.x (frontend), Prisma 6.x (ORM), pdf-parse (PDF text), @azure/ai-form-recognizer (cloud OCR), tesseract.js (local OCR fallback)
**Storage**: PostgreSQL via Prisma (structured data), local filesystem `uploads/` (PDF files)
**Testing**: Jest (unit + integration), test K-1 PDF fixtures in `test/import/`
**Target Platform**: Docker (node:22-slim), self-hosted or Railway
**Project Type**: Web application (NestJS API + Angular SPA) — Nx monorepo
**Performance Goals**: PDF extraction < 30 seconds (SC-001), model creation < 5 seconds (SC-005), 90%+ accuracy for digital PDFs (SC-002)
**Constraints**: Self-hosted capable (Azure OCR optional), max PDF size 25 MB, K-1 Form 1065 only (V1)
**Scale/Scope**: Single family office (10–50 partnerships, 10–50 K-1s/year), 2 new API modules, 4 new frontend pages
## Constitution Check
_GATE: Must pass before Phase 0 research. Re-check after Phase 1 design._
No constitution.md exists for this project. Gates assessed against standard engineering principles:
| Gate | Status | Notes |
|------|--------|-------|
| No unnecessary dependencies | PASS | 3 new packages (`pdf-parse`, `@azure/ai-form-recognizer`, `tesseract.js`) — each serves a distinct, justified purpose per research.md |
| Follows existing patterns | PASS | New NestJS modules follow existing controller/service/DTO pattern (mirrors `k-document`, `upload` modules) |
| No breaking changes | PASS | 3 new Prisma models + 1 enum, back-references only on existing models — no column changes |
| Test coverage | PASS | Unit tests for extractors, mapper, allocation, aggregation; integration tests for full pipeline |
| Self-hosted compatible | PASS | Core extraction (pdf-parse) is fully local; Azure is optional with tesseract.js fallback |
**Post-Phase 1 re-check**: PASS — data model adds 3 models/1 enum (K1ImportSession, CellMapping, CellAggregationRule, K1ImportStatus). No existing schema changes beyond back-references. API contracts follow existing REST patterns. Aggregation rules are dynamically computed — no stored denormalization. No violations identified.
## Project Structure
### Documentation (this feature)
```text
specs/004-k1-scan-import/
├── plan.md # This file
├── research.md # Phase 0: OCR provider research & decisions
├── data-model.md # Phase 1: K1ImportSession, CellMapping, CellAggregationRule models
├── quickstart.md # Phase 1: Setup & dev guide
├── contracts/
│ └── k1-import-api.md # Phase 1: REST API contracts
├── checklists/
│ └── requirements.md # Spec quality checklist
└── tasks.md # Phase 2 output (created by /speckit.tasks)
```
### Source Code (repository root)
```text
apps/api/src/app/
├── k1-import/
│ ├── k1-import.module.ts
│ ├── k1-import.controller.ts
│ ├── k1-import.service.ts
│ ├── dto/
│ │ ├── upload-k1.dto.ts
│ │ ├── verify-k1.dto.ts
│ │ └── confirm-k1.dto.ts
│ ├── extractors/
│ │ ├── k1-extractor.interface.ts
│ │ ├── pdf-parse-extractor.ts
│ │ ├── azure-extractor.ts
│ │ └── tesseract-extractor.ts
│ ├── k1-field-mapper.service.ts
│ ├── k1-allocation.service.ts
│ ├── k1-confidence.service.ts
│ └── k1-aggregation.service.ts # Dynamically computes aggregation summaries
├── cell-mapping/
│ ├── cell-mapping.module.ts
│ ├── cell-mapping.controller.ts # Cell mapping + aggregation rule CRUD
│ └── cell-mapping.service.ts
apps/client/src/app/
├── pages/
│ ├── k1-import/
│ │ ├── k1-import-page.component.ts
│ │ ├── k1-import-page.html
│ │ ├── k1-import-page.scss
│ │ ├── k1-import-page.routes.ts
│ │ ├── k1-verification/
│ │ │ ├── k1-verification.component.ts # Mapped cells + unmapped items + aggregations
│ │ │ ├── k1-verification.html
│ │ │ └── k1-verification.scss
│ │ └── k1-confirmation/
│ │ ├── k1-confirmation.component.ts
│ │ ├── k1-confirmation.html
│ │ └── k1-confirmation.scss
│ ├── cell-mapping/
│ │ ├── cell-mapping-page.component.ts # Cell mapping + aggregation rule config
│ │ ├── cell-mapping-page.html
│ │ └── cell-mapping-page.routes.ts
│ └── k-document/ # Existing page — extended
│ └── k-document-detail/ # Add aggregation summary section (FR-036)
├── services/
│ └── k1-import-data.service.ts
libs/common/src/lib/
├── interfaces/
│ └── k1-import.interface.ts
├── dtos/
│ └── k1-import/
│ ├── create-k1-import.dto.ts
│ ├── verify-k1-import.dto.ts
│ └── confirm-k1-import.dto.ts
prisma/
├── schema.prisma # + K1ImportSession, CellMapping, CellAggregationRule, K1ImportStatus
├── migrations/
│ └── 2026XXXX_added_k1_import/ # New migration
test/import/
├── sample-k1-digital.pdf # Test fixture: digital K-1
└── sample-k1-scanned.pdf # Test fixture: scanned K-1
```
**Structure Decision**: Follows the existing Nx monorepo convention with new NestJS modules under `apps/api/src/app/` and new Angular pages under `apps/client/src/app/pages/`. Shared interfaces and DTOs in `libs/common/`. This mirrors the existing `k-document`, `upload`, and `family-office` module patterns. The KDocument detail view is extended (not replaced) to display aggregation summaries.

126
specs/004-k1-scan-import/quickstart.md

@ -0,0 +1,126 @@
# Quickstart: K-1 PDF Scan Import
**Phase 1 Output** | **Date**: 2026-03-18 | **Updated**: 2026-03-18 (post-clarification)
## Prerequisites
1. Spec 001-family-office-transform models are implemented (Entity, Partnership, PartnershipMembership, KDocument, Distribution, Document)
2. At least one Partnership with one or more member Entities exists in the database
3. The existing upload infrastructure (`UploadController`, `uploads/` directory) is functional
4. Node.js ≥ 22.18.0, Docker for PostgreSQL/Redis
## Environment Setup
Add to `.env` (optional — for Azure OCR of scanned PDFs):
```
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
AZURE_DOCUMENT_INTELLIGENCE_KEY=your-api-key
```
If these are empty, scanned PDFs fall back to `tesseract.js` (lower accuracy but fully self-hosted).
## New Dependencies
```bash
npm install pdf-parse @azure/ai-form-recognizer tesseract.js
npm install -D @types/pdf-parse
```
## Database Migration
After adding the new Prisma models (`K1ImportSession`, `CellMapping`, `CellAggregationRule`, `K1ImportStatus` enum):
```bash
npx prisma db push # Development: sync schema
# OR
npx prisma migrate dev # Create a migration file
```
Seed the default IRS cell mappings (28 rows with partnershipId = null) and default aggregation rules (e.g., "Total Ordinary Income", "Total Capital Gains", "Total Deductions") via the existing seed mechanism or a dedicated seed script.
## Key Files to Create
### Backend (apps/api/src/)
```
app/k1-import/
├── k1-import.module.ts # NestJS module
├── k1-import.controller.ts # REST endpoints (see contracts/k1-import-api.md)
├── k1-import.service.ts # Orchestration: upload → extract → verify → confirm
├── dto/
│ ├── upload-k1.dto.ts # Multipart upload DTO
│ ├── verify-k1.dto.ts # Verification submission DTO
│ └── confirm-k1.dto.ts # Confirmation request DTO
├── extractors/
│ ├── k1-extractor.interface.ts # Common extraction interface
│ ├── pdf-parse-extractor.ts # Tier 1: digital PDF text extraction
│ ├── azure-extractor.ts # Tier 2: Azure Document Intelligence
│ └── tesseract-extractor.ts # Tier 2 fallback: tesseract.js OCR
├── k1-field-mapper.service.ts # Maps raw extraction → K1ExtractedField[]
├── k1-allocation.service.ts # Allocates K-1 amounts to members by ownership %
├── k1-confidence.service.ts # Computes confidence scores with validation heuristics
└── k1-aggregation.service.ts # Dynamically computes aggregation summaries from rules
app/cell-mapping/
├── cell-mapping.module.ts # NestJS module
├── cell-mapping.controller.ts # CRUD for cell mappings + aggregation rules
└── cell-mapping.service.ts # Cell mapping + aggregation rule business logic + seed data
```
### Shared Types (libs/common/src/lib/)
```
interfaces/
├── k1-import.interface.ts # K1ExtractionResult, K1ExtractedField, K1ConfirmationRequest
dtos/
├── k1-import/
│ ├── create-k1-import.dto.ts
│ ├── verify-k1-import.dto.ts
│ └── confirm-k1-import.dto.ts
```
### Frontend (apps/client/src/app/)
```
pages/k1-import/
├── k1-import-page.component.ts # Upload + history view
├── k1-import-page.html
├── k1-import-page.scss
├── k1-import-page.routes.ts
├── k1-verification/
│ ├── k1-verification.component.ts # Verification/edit screen (mapped + unmapped + aggregations)
│ ├── k1-verification.html
│ └── k1-verification.scss
└── k1-confirmation/
├── k1-confirmation.component.ts # Confirmation result screen
├── k1-confirmation.html
└── k1-confirmation.scss
pages/cell-mapping/
├── cell-mapping-page.component.ts # Cell mapping + aggregation rule configuration UI
├── cell-mapping-page.html
└── cell-mapping-page.routes.ts
services/
├── k1-import-data.service.ts # HTTP client for k1-import endpoints
```
## Verification Workflow
1. **Upload**: User selects PDF → `POST /api/v1/k1-import/upload` → session created with status PROCESSING
2. **Extract**: Backend detects PDF type (digital vs. scanned) → routes to appropriate extractor → status becomes EXTRACTED
3. **Review**: Frontend polls/fetches session → displays verification screen with:
- **Mapped cells**: extracted fields with confidence indicators. High-confidence values are pre-accepted. Medium/low-confidence values require explicit review (acknowledge or edit).
- **Unmapped items**: separate section for values that didn't match any cell. User assigns to a cell or discards.
- **Aggregation summaries**: dynamically computed from mapped values using aggregation rules. Recalculate live when cell values are edited.
4. **Verify**: User reviews all medium/low fields and resolves unmapped items → `PUT /api/v1/k1-import/:id/verify` → status becomes VERIFIED
5. **Confirm**: User clicks "Confirm & Save" → `POST /api/v1/k1-import/:id/confirm` → KDocument + Distributions + Document created → status becomes CONFIRMED
## Testing Strategy
- **Unit tests**: Extractors (pdf-parse, azure, tesseract), field mapper, confidence scoring, allocation math, aggregation computation
- **Integration tests**: Full upload → extract → verify → confirm flow with test PDF fixtures
- **Test fixtures**: Include sample K-1 PDFs (digital and scanned) in `test/import/` directory
- **Allocation accuracy**: Verify rounding behavior — allocated amounts must sum exactly to partnership total
- **Aggregation tests**: Verify dynamic computation from rules, auto-recalculation on value edit, behavior when source cells are empty
- **Review enforcement**: Verify confirmation blocked when medium/low-confidence fields not reviewed or unmapped items unresolved

205
specs/004-k1-scan-import/research.md

@ -0,0 +1,205 @@
# Research: K-1 PDF Scan Import
**Phase 0 Output** | **Date**: 2026-03-18
## Decision 1: PDF Text Extraction (Tier 1 — Digital PDFs)
**Decision**: Use `pdf-parse` npm package for digitally-generated K-1 PDFs.
**Rationale**: Digitally-generated PDFs from fund administrators contain embedded text. `pdf-parse` extracts this text losslessly, is free, fully self-hosted, and instant. It has 3M+ weekly npm downloads and a stable API. No external API calls needed.
**Alternatives Considered**:
- `pdfjs-dist` (Mozilla pdf.js) — lower-level, requires more boilerplate for text extraction; `pdf-parse` wraps this already.
- Cloud OCR for all PDFs — unnecessary cost and latency for digital PDFs where text extraction is 100% accurate.
---
## Decision 2: OCR for Scanned PDFs (Tier 2)
**Decision**: Use Azure AI Document Intelligence (Layout model) as primary Tier 2 provider, with `tesseract.js` as self-hosted fallback.
**Rationale**:
- Azure has the best tax-form pedigree among cloud providers (prebuilt IRS models for W-2, 1098, 1099)
- Returns per-field confidence scores (0.0–1.0) natively, directly fulfilling FR-006/FR-009
- 500 free pages/month covers typical family office volume (10–50 K-1s/year)
- `@azure/ai-form-recognizer` has full TypeScript types, aligns with NestJS patterns
- `tesseract.js` runs as WASM in Node.js (no system install), provides ~75% accuracy fallback
**Alternatives Considered**:
- Google Document AI — good form parsing but no tax-specific models, more expensive for custom processors ($30/1K pages)
- AWS Textract — strong table extraction but less established for tax forms, requires IAM setup
- Tesseract.js only — accuracy drops to 70–85% for clean scans, no layout understanding; acceptable as fallback but not primary
---
## Decision 3: Two-Tier Extraction Architecture
**Decision**: Implement a PDF type detection step that routes digital PDFs to local extraction (free, instant) and scanned PDFs to cloud OCR.
**Rationale**: Most K-1s from fund administrators are digitally generated. The two-tier approach avoids unnecessary API calls and costs for the majority case, while still supporting scanned documents.
**Detection heuristic**: Extract text via `pdf-parse`; if extracted text length < 100 characters or does not contain K-1 keywords ("Schedule K-1", "Form 1065", "Partner's Share"), route to Tier 2 OCR.
**Alternatives Considered**:
- Cloud OCR for everything — simpler but adds cost ($0.15/page) and latency (3–10s) for digital PDFs that don't need it
- Local OCR only (Tesseract.js) — insufficient accuracy (75%) for production tax data; too many manual corrections needed
---
## Decision 4: K-1 Box Extraction Strategy
**Decision**: Use regex-based box extraction for Tier 1 (digital text), and key-value pair extraction from the OCR provider for Tier 2. Both feed into a shared K-1 field mapper that applies the cell mapping configuration.
**Rationale**: The IRS Schedule K-1 (Form 1065) has a consistent, standardized layout:
- Page 1: Header + Part I (partnership info) + Part II (partner info) + Boxes 1–11
- Page 2: Boxes 12–20+ with code/sub-code details
- Box values sit in a numbered two-column grid: number label → description → value field
- Layout has been structurally stable for years, making template/regex extraction reliable
**Challenges addressed**:
- Multi-line sub-codes (Boxes 11, 13, 15, 16, 17, 18, 20) — handle by extracting code-letter/value pairs within each box section
- Supplemental schedules — out of scope for V1 auto-extraction; captured as additional Document attachments
- Multi-entity PDFs — detect via repeated "Schedule K-1" headers; split and process each K-1 separately
**Alternatives Considered**:
- Fixed coordinate-based extraction — too brittle across different PDF generators (varying margins, fonts)
- Machine learning model — overkill for V1 given the standardized form layout
---
## Decision 5: Confidence Scoring Approach
**Decision**: Three-level confidence display (High/Medium/Low) derived from extraction method and validation heuristics.
**Rationale**:
For **Tier 1** (digital text):
- Base confidence: 0.90 (text extraction is inherently reliable)
- +0.05 if box number regex matched cleanly
- +0.05 if value format validated (currency, percentage, integer)
- -0.10 to -0.30 for potential adjacent-box text contamination
For **Tier 2** (cloud OCR):
- Use Azure's native per-field confidence score directly
- Layer cross-field validation (e.g., Box 6b ≤ Box 6a, sub-boxes sum to parent)
**Display mapping**:
- High (≥ 0.85): Green — no user attention needed
- Medium (0.60–0.84): Yellow — explicit review (acknowledge or edit) required before confirmation (see Decision 12)
- Low (< 0.60): Red highlighted, requires manual review (FR-009)
**Alternatives Considered**:
- Binary confidence (confident/not) — too coarse; doesn't guide the user's review attention
- Numeric score display — too technical for a non-engineer user; three levels with color coding is more actionable
---
## Decision 6: New Database Models
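**Decision**: Add two new Prisma models (`K1ImportSession`, `CellMapping`) to support import tracking and cell mapping configuration, alongside the existing K-document models from spec 001. A third new model, `CellAggregationRule`, stores aggregation rule definitions and is covered in Decision 10.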
**Rationale**:
- `K1ImportSession` tracks the full import lifecycle (upload → processing → extracted → verified → confirmed/cancelled), enabling import history (FR-022) and re-processing (FR-023)
- `CellMapping` stores per-partnership cell label customizations (FR-017 through FR-021) separate from the KDocument data itself
**Alternatives Considered**:
- Store import sessions as JSON metadata on KDocument — would conflate document data with import workflow state; makes import history harder to query
- Store cell mappings as JSON on Partnership — would work but loses the ability to query/manage mappings independently and doesn't support a global default set
---
## Decision 7: File Storage
**Decision**: Use the existing `uploads/` directory and `Document` model from spec 001. Uploaded K-1 PDFs are stored on the local filesystem, with metadata in the `Document` table.
**Rationale**: The existing upload infrastructure (UploadController with `FileInterceptor`, Document model, `uploads/` directory) is already in place. No need to add a new storage mechanism.
**Alternatives Considered**:
- S3/cloud storage — would require new infrastructure; the self-hosted philosophy favors local storage
- Database blob storage — increases database size and backup time for binary files
---
## Decision 8: New Environment Variables
**Decision**: Add two optional environment variables for Azure Document Intelligence, following the existing `ConfigurationService` pattern with `str({ default: '' })`.
```
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT — Azure resource endpoint URL
AZURE_DOCUMENT_INTELLIGENCE_KEY — Azure API key
```
**Rationale**: When both are empty (default), the system falls back to `tesseract.js` for scanned PDFs. This makes Azure optional — the feature works fully self-hosted with degraded OCR accuracy.
**Alternatives Considered**:
- Separate feature flag — unnecessary; empty credentials are sufficient to indicate "not configured"
- Google/AWS credentials — Azure recommended as primary; could add additional providers later
---
## Decision 9: New npm Dependencies
**Decision**: Add the following packages:
| Package | Purpose | Tier |
|---|---|---|
| `pdf-parse` | Text extraction from digital PDFs | Tier 1 (required) |
| `@azure/ai-form-recognizer` | Cloud OCR for scanned PDFs | Tier 2 (optional) |
| `tesseract.js` | Self-hosted OCR fallback | Tier 2 fallback |
**Rationale**: `pdf-parse` is essential for the Tier 1 (free, local) path. Azure SDK is optional (only loaded when credentials are configured). `tesseract.js` provides a zero-config fallback that runs as WASM — no system dependencies needed, works in the existing `node:22-slim` Docker image.
**Alternatives Considered**:
- `pdfjs-dist` directly instead of `pdf-parse` — more boilerplate, `pdf-parse` wraps it with a simpler API
- Only cloud OCR — loses the self-hosted story and adds cost for digital PDFs
---
## Decision 10: Cell Aggregation Rules — Dynamic Computation
**Decision**: Persist only aggregation rule definitions (name, source cells, operation). Compute totals dynamically from raw K-1 box values at display time. Do NOT store computed totals.
**Rationale**:
- K-1 values can change during the import lifecycle (estimated → final transitions, manual edits after confirmation)
- Storing computed totals creates a denormalization risk — stale aggregates when underlying values change
- Computation is trivial (summing a handful of numbers) with no performance concern at family office scale
- Keeps a single source of truth: the raw box values in K1Data
- Aggregation rules are displayed on both the verification screen (FR-033) and KDocument detail view (FR-036)
**Alternatives Considered**:
- Persist computed totals alongside raw data — creates stale data risk, requires update triggers
- Persist both (snapshot + live) for audit — adds complexity V1 doesn't need; audit trail exists in import session history
---
## Decision 11: Unmapped Items Handling
**Decision**: Display extracted values that don't match any configured cell mapping in a separate "Unmapped Items" section on the verification screen. Administrator can assign to an existing cell, create a new custom cell, or discard.
**Rationale**:
- OCR/extraction may pull supplemental schedule items, footnotes, state-specific addenda
- Silently discarding loses potentially important data
- Auto-creating cells for every unmatched value creates noise
- Explicit user decision preserves data integrity while keeping mapped cells clean
- Assigned unmapped items update the cell mapping for future imports (learning effect)
**Alternatives Considered**:
- Silent discard — loses data, violates user's expectation of completeness
- Auto-create custom cells — too noisy; PDF footnotes and headers would create junk cells
---
## Decision 12: Verification Auto-Accept Strategy
**Decision**: Auto-accept (pre-check) high-confidence values on the verification screen. Require explicit review (acknowledge or edit) for medium and low-confidence values before allowing confirmation.
**Rationale**:
- V1 is "partially manual, partially automated" per user intent
- High-confidence values (≥ 0.85) from digital PDFs are reliably accurate (90%+ per SC-002)
- Forcing explicit review of every cell wastes time on correct values
- Blocking confirmation until medium/low-confidence fields are reviewed catches the errors
- All values remain visible and editable — user can override any pre-accepted value
**Alternatives Considered**:
- Every cell requires explicit accept — too slow for 15+ fields, doesn't match "partially automated" intent
- Spot-check model (everything auto-accepted) — too risky for tax data; OCR errors would go unreviewed

220
specs/004-k1-scan-import/spec.md

@ -0,0 +1,220 @@
# Feature Specification: Automated K-1 PDF Scanning & Model Object Creation
**Feature Branch**: `004-k1-scan-import`
**Created**: 2026-03-18
**Status**: Draft
**Input**: User description: "Automated K-1 PDF scanning and model object creation with cell mapping verification. User sets up Entities and Partnerships manually, then scans a K-1 PDF document and the system auto-creates model objects (KDocument, Distributions, etc.) based on K-1 cell mapping. V1 includes manual verification of mapped values with cell name override capability."
## User Scenarios & Testing _(mandatory)_
### User Story 1 - Upload & Scan K-1 PDF (Priority: P1)
A family office administrator has already set up their Entities and Partnerships in the system. They receive a K-1 PDF (Schedule K-1, Form 1065) from a fund administrator. They navigate to the partnership's K-1 section, upload the PDF, and the system extracts structured data from the document using OCR/document intelligence. The extracted values are mapped to the standard K-1 box numbers (Box 1 through Box 19) and presented to the administrator for review before saving.
**Why this priority**: This is the core value proposition — eliminating manual K-1 data entry. Without PDF scanning and extraction, the administrator must type every value by hand, which is the exact pain point this feature solves.
**Independent Test**: Can be fully tested by uploading a sample K-1 PDF for an existing partnership, verifying that extracted values appear in a review screen mapped to the correct box numbers, and confirming the extraction completes without errors.
**Acceptance Scenarios**:
1. **Given** an existing partnership with at least one member entity, **When** the administrator uploads a K-1 PDF file, **Then** the system processes the document and displays extracted values mapped to K-1 box numbers within 30 seconds.
2. **Given** a K-1 PDF in standard IRS format, **When** the system extracts data, **Then** the partner name, partnership name, EIN, tax year, and all populated box values (Box 1 through Box 19) are identified and displayed.
3. **Given** a scanned (image-based) K-1 PDF rather than a digitally-generated one, **When** the administrator uploads it, **Then** the system applies OCR and extracts values with the same box mapping.
4. **Given** a K-1 PDF where certain boxes are empty or contain zero, **When** the system extracts data, **Then** those boxes are displayed as zero/empty rather than showing incorrect values from adjacent cells.
5. **Given** a multi-page K-1 PDF (Form 1065 with supplemental schedules), **When** uploaded, **Then** the system extracts data from all relevant pages and consolidates into a single review view.
---
### User Story 2 - Review & Verify Extracted Values (Priority: P1)
After the K-1 PDF is scanned, the administrator sees a verification screen showing every extracted value alongside its mapped K-1 box/cell. Each row displays: the cell label (e.g., "Box 1 - Ordinary business income"), the extracted value, and a confidence indicator. The administrator can accept each value, edit it if the extraction was incorrect, or override the cell name/label if the mapping is wrong. The administrator must explicitly confirm before the data is saved to the system.
**Why this priority**: OCR is inherently imperfect. Without a verification step, incorrect values would silently corrupt K-1 records and tax data. This is the safety net that makes the feature trustworthy for V1.
**Independent Test**: Can be fully tested by scanning a K-1 PDF, modifying an extracted value and a cell label on the verification screen, confirming, and verifying the saved KDocument record reflects the corrections.
**Acceptance Scenarios**:
1. **Given** a scanned K-1 with extracted values, **When** the verification screen loads, **Then** each extracted value shows the box number, label, extracted amount, and a confidence level (high/medium/low). High-confidence values are pre-accepted; medium/low-confidence values require explicit review before confirmation is allowed.
2. **Given** the verification screen, **When** the administrator edits an extracted value (e.g., changes Box 1 from $50,000 to $52,000), **Then** the corrected value is reflected immediately and will be used when saving.
3. **Given** the verification screen, **When** the administrator overrides a cell name (e.g., renames "Box 11 - Other income" to "Box 11 - Section 1256 contracts"), **Then** the custom label is saved alongside the value.
4. **Given** a low-confidence extraction for a specific box, **When** the verification screen displays it, **Then** the field is visually highlighted to draw the administrator's attention for manual review.
5. **Given** all values have been reviewed on the verification screen, **When** the administrator clicks "Confirm & Save", **Then** the system creates a KDocument record with all verified values and links it to the partnership and tax year.
6. **Given** the verification screen, **When** the administrator clicks "Cancel" or navigates away, **Then** no data is saved and the uploaded PDF is discarded (or retained as an unprocessed document).
---
### User Story 3 - Auto-Create Model Objects from Verified K-1 Data (Priority: P1)
After the administrator confirms the verified K-1 data, the system automatically creates and updates the downstream model objects. Specifically: a KDocument is created with the structured box data and linked to the partnership and tax year; the K-1 amounts are allocated to each partnership member based on their ownership percentages; and Distribution records are created for Box 19 (distributions) entries. The uploaded PDF is saved as a Document record and linked to the KDocument.
**Why this priority**: This is the automation payoff — the reason for scanning in the first place. Without auto-creation of downstream objects, the user would still need to manually create KDocuments and allocate amounts, negating the benefit of scanning.
**Independent Test**: Can be fully tested by confirming verified K-1 data for a partnership with 2 members (60%/40% split), then verifying that a KDocument exists with correct box values, member allocations are computed correctly, Distribution records were created for Box 19, and the PDF Document is linked.
**Acceptance Scenarios**:
1. **Given** confirmed K-1 data for a partnership with two members (60%/40%), **When** the system processes the confirmation, **Then** a KDocument record is created with type K1, the correct tax year, filing status DRAFT, and all box values stored in the structured data field.
2. **Given** confirmed K-1 data with Box 19a (cash distributions) of $100,000, **When** the system creates downstream objects, **Then** Distribution records are created for each member: $60,000 for the 60% member and $40,000 for the 40% member, both with type RETURN_OF_CAPITAL and the appropriate date.
3. **Given** confirmed K-1 data, **When** the system allocates amounts, **Then** each member's allocated K-1 values (ordinary income, capital gains, deductions, etc.) match their ownership percentage of the partnership-level amounts.
4. **Given** the uploaded PDF file, **When** the KDocument is created, **Then** a Document record is created with type K1, the PDF file is stored, and the Document is linked to the KDocument via `documentFileId`.
5. **Given** a partnership that already has a KDocument for the same tax year and type (K1), **When** the administrator scans a new K-1 PDF for that same year, **Then** the system prompts whether to update the existing record or create a new version, and the existing record's status history is preserved.
---
### User Story 4 - Cell Mapping Configuration (Priority: P2)
The administrator can view and customize the K-1 cell mapping configuration that the system uses to extract and label values. The default mapping follows the standard IRS Schedule K-1 (Form 1065) layout, but the administrator can add custom cell labels, rename existing cells, or define custom extraction regions for non-standard K-1 formats. The configuration is saved per partnership or globally and reused for future K-1 imports for that partnership.
**Why this priority**: While the default IRS mapping covers most K-1s, some fund administrators use supplemental schedules or non-standard formats. Custom mapping ensures the system works across the administrator's full portfolio of partnerships. This is P2 because the default mapping handles the majority case.
**Independent Test**: Can be fully tested by modifying the cell mapping for a specific partnership (e.g., adding a custom "Box 20 - Section 199A" field), scanning a K-1 PDF, and verifying the custom field appears in the verification screen with extracted data.
**Acceptance Scenarios**:
1. **Given** the cell mapping configuration screen, **When** the administrator views it, **Then** all standard K-1 boxes (1-19) are listed with their IRS-defined labels and box numbers.
2. **Given** the cell mapping configuration, **When** the administrator adds a custom cell (e.g., "Box 20 - Qualified Business Income"), **Then** the custom cell is saved and included in future extraction results for that partnership.
3. **Given** a partnership with a custom cell mapping, **When** a K-1 PDF is scanned for that partnership, **Then** the extraction uses the custom mapping configuration instead of the global default.
4. **Given** a modified cell label (e.g., Box 11 renamed to "Section 1256 Contracts"), **When** the extraction runs, **Then** the renamed label is displayed on the verification screen alongside the extracted value.
5. **Given** the administrator wants to reset to defaults, **When** they select "Reset to IRS Default Mapping", **Then** all custom labels and additional cells are removed and the standard mapping is restored.
---
### User Story 5 - K-1 Import History & Re-Processing (Priority: P3)
The administrator can view a history of all K-1 imports for a partnership, including the date of import, the uploaded PDF, the extraction results, and whether the data was accepted or rejected. They can re-process a previously uploaded PDF (e.g., after updating the cell mapping) or re-upload a corrected PDF. The system maintains an audit trail of changes between estimated and final K-1 data.
**Why this priority**: Import history and re-processing are important for tax season workflows (estimated → final K-1 transitions) but depend on the core scan/verify/create pipeline being complete first.
**Independent Test**: Can be fully tested by scanning two K-1 PDFs for the same partnership/year (one estimated, one final), verifying both appear in the import history, and confirming the KDocument status transitions from ESTIMATED to FINAL with the updated values.
**Acceptance Scenarios**:
1. **Given** a partnership with multiple K-1 imports over time, **When** the administrator views the import history, **Then** each import is listed with date, filename, tax year, status (accepted/rejected), and current KDocument status (draft/estimated/final).
2. **Given** a previously imported K-1 PDF, **When** the administrator selects "Re-process", **Then** the system re-runs extraction using the current cell mapping configuration and shows the verification screen with updated results.
3. **Given** an existing KDocument with status ESTIMATED, **When** the administrator scans the final K-1 PDF and confirms, **Then** the KDocument status updates to FINAL and the values are updated, with the previous estimated values preserved in the change history.
4. **Given** a rejected import, **When** the administrator views it in history, **Then** they can see why it was rejected and can re-upload a corrected PDF to try again.
---
### Edge Cases
- What happens when the uploaded file is not a valid PDF? The system must reject the file with a clear error message before attempting extraction.
- What happens when the K-1 PDF is password-protected? The system must detect this and prompt the user to provide the password or upload an unprotected version.
- What happens when the OCR extraction returns zero values for all boxes? The system must warn the user that extraction may have failed and recommend checking if the PDF is readable.
- What happens when the extracted partner name or EIN does not match any existing entity in the system? The system must flag the mismatch on the verification screen and allow the user to manually select the correct entity/partnership.
- What happens when multiple K-1s are received for different members of the same partnership in a single multi-page PDF? The system must detect multiple K-1 forms within one document and process each separately.
- What happens when the extracted tax year does not match the expected year? The system must highlight the discrepancy and allow the user to confirm or override the tax year.
- What happens when ownership percentages have changed during the tax year? The system must use the ownership percentage as of the K-1's tax year end date for allocation calculations.
- What happens when the upload exceeds the maximum file size (e.g., large scanned documents)? The system must enforce a file size limit and provide a clear error message.
- What happens when extracted values don't match any configured cell mapping (e.g., footnotes, supplemental schedule items, state-specific addenda)? The system displays them in a separate "Unmapped Items" section on the verification screen where the administrator can assign them to a cell or discard them.
## Requirements _(mandatory)_
### Functional Requirements
**K-1 PDF Upload & Processing**
- **FR-001**: System MUST accept PDF file uploads (both digitally-generated and scanned/image-based) for K-1 document processing.
- **FR-002**: System MUST extract structured data from K-1 PDFs using document intelligence/OCR, identifying standard IRS Schedule K-1 (Form 1065) fields: partner information, partnership information, tax year, and box values (Box 1 through Box 19).
- **FR-003**: System MUST validate that uploaded files are valid PDFs before processing and reject non-PDF files with a clear error message.
- **FR-004**: System MUST handle both single-page and multi-page K-1 PDFs, including supplemental schedules.
- **FR-005**: System MUST complete extraction and present results within 30 seconds for a standard K-1 PDF.
**Verification & Manual Review**
- **FR-006**: System MUST display a verification screen after extraction showing each extracted value with its mapped K-1 box number, label, value, and confidence level (high/medium/low). High-confidence values MUST be pre-accepted (shown as checked/approved) by default.
- **FR-007**: System MUST allow the administrator to edit any extracted value before confirmation, including overriding pre-accepted high-confidence values.
- **FR-008**: System MUST allow the administrator to override/rename any cell label on the verification screen.
- **FR-009**: System MUST visually highlight medium and low-confidence extractions as requiring explicit review. Low-confidence fields MUST be flagged with a warning indicator; medium-confidence fields MUST be flagged with a review indicator.
- **FR-010**: System MUST require explicit confirmation ("Confirm & Save") before creating any model objects from extracted data.
- **FR-035**: System MUST NOT allow confirmation until all medium and low-confidence fields have been explicitly reviewed (acknowledged or edited) by the administrator. High-confidence fields do not require explicit review but remain editable.
- **FR-037**: When extracted data contains values that do not match any configured cell in the mapping, the system MUST display them in a separate "Unmapped Items" section on the verification screen, distinct from the mapped cells.
- **FR-038**: For each unmapped item, the administrator MUST be able to either assign it to an existing or new cell mapping, or explicitly discard it. Discarded unmapped items are not persisted to the KDocument.
- **FR-011**: System MUST allow the administrator to cancel/discard the extraction without saving any data.
**Automatic Model Object Creation**
- **FR-012**: Upon confirmation, system MUST create a KDocument record with the verified box values, linked to the correct partnership and tax year, with filing status DRAFT.
- **FR-013**: Upon confirmation, system MUST allocate K-1 line item amounts to each partnership member based on their active ownership percentage as of the tax year end date.
- **FR-014**: Upon confirmation, system MUST create Distribution records for Box 19a (cash distributions) and Box 19b (property distributions) amounts, allocated proportionally to each member.
- **FR-015**: Upon confirmation, system MUST create a Document record for the uploaded PDF with type K1 and link it to the KDocument via `documentFileId`.
- **FR-016**: When a KDocument already exists for the same partnership, type, and tax year, system MUST prompt whether to update the existing record or reject the import, preserving the previous data for audit purposes.
**Cell Mapping Configuration**
- **FR-017**: System MUST provide a default cell mapping based on the standard IRS Schedule K-1 (Form 1065) box layout (Box 1 through Box 19 with IRS-defined labels).
- **FR-018**: System MUST allow the administrator to add custom cells beyond the standard K-1 boxes (e.g., Box 20 for QBI, state-specific items).
- **FR-019**: System MUST allow the administrator to rename/relabel any cell in the mapping.
- **FR-020**: System MUST support saving cell mapping configurations per partnership for reuse across tax years.
- **FR-021**: System MUST allow resetting a partnership's cell mapping to the IRS default.
**Cell Aggregation Rules**
- **FR-030**: System MUST allow the administrator to define aggregation rules that combine multiple K-1 cells into computed summary values (e.g., "Total Ordinary Income" = Box 1, "Total Capital Gains" = Box 8a + Box 9a + Box 10).
- **FR-031**: Each aggregation rule MUST specify a name, a list of source cell references (box numbers), and an aggregation operation (sum for V1; additional operations deferred).
- **FR-032**: Aggregation rules MUST be saved per partnership (or globally) alongside cell mappings and reused across tax years.
- **FR-033**: On the verification screen, computed summary values MUST be displayed alongside individual cell values, clearly distinguished as derived/aggregated rather than directly extracted.
- **FR-034**: When an individual cell value is edited during verification, any aggregation that includes that cell MUST automatically recalculate.
- **FR-036**: On the KDocument detail view, aggregated summary values MUST be displayed alongside the raw box values. The summary section shows each named aggregation rule with its computed total, so the administrator can reference combined values (e.g., "Income Summary", "Total Capital Gains") when reviewing any K-1 record after import.
- **FR-039**: Aggregation summary values MUST be computed dynamically from the raw box values each time they are displayed (on the verification screen and KDocument detail view). Only the aggregation rules (name, source cells, operation) are persisted — not the computed totals. This ensures summaries auto-update when underlying box values change (e.g., during estimated-to-final K-1 transitions).
**Import History & Audit**
- **FR-022**: System MUST maintain a history of all K-1 import attempts per partnership, including upload date, filename, tax year, and outcome (accepted/cancelled).
- **FR-023**: System MUST support re-processing a previously uploaded K-1 PDF with the current cell mapping configuration.
- **FR-024**: System MUST support KDocument status transitions (DRAFT → ESTIMATED → FINAL) when re-importing updated K-1s for the same partnership and tax year.
- **FR-025**: System MUST preserve previous K-1 values when updating from estimated to final, maintaining an audit trail.
**Validation & Error Handling**
- **FR-026**: System MUST validate that the extracted or user-selected partnership exists in the system before creating model objects.
- **FR-027**: System MUST validate that the partnership has active members before attempting allocation.
- **FR-028**: System MUST enforce a maximum file size limit for uploaded PDFs and communicate the limit clearly.
- **FR-029**: System MUST detect password-protected PDFs and prompt the user to provide an unprotected version.
### Key Entities
- **K1ImportSession**: A record of a single K-1 PDF import attempt. Tracks the uploaded file, extraction status (processing/extracted/verified/confirmed/cancelled/failed), raw extraction results, verified results after user edits, and the resulting KDocument if confirmed. Linked to a Partnership and a User. Enables import history and re-processing.
- **CellMapping**: A configuration defining how K-1 box numbers map to labels and extraction regions. Has a default IRS-standard set and supports per-partnership customization. Key attributes: box number, label, description, custom flag, partnership (optional — null means global default).
- **CellAggregationRule**: A named rule that combines multiple CellMapping entries into a computed summary value. Key attributes: name (e.g., "Income Summary"), source cell references (list of box numbers), aggregation operation (SUM for V1), partnership (optional — null means global default). Computed totals are NOT persisted — they are derived dynamically from raw box values each time. Displayed on the verification screen and KDocument detail view as derived rows.
- **KDocument** (existing from 001-family-office-transform): Extended to be auto-created from verified scan data rather than only manual entry. The structured `data` JSON field stores the K1Data interface values.
- **Distribution** (existing from 001-family-office-transform): Auto-created from Box 19 data during K-1 import confirmation, allocated to members by ownership percentage.
- **Document** (existing from 001-family-office-transform): Created automatically for the uploaded K-1 PDF and linked to the KDocument.
## Success Criteria _(mandatory)_
### Measurable Outcomes
- **SC-001**: Administrator can upload a K-1 PDF and see extracted values on the verification screen within 30 seconds of upload.
- **SC-002**: For a digitally-generated (non-scanned) K-1 PDF in standard IRS format, the system correctly extracts at least 90% of populated box values without manual correction.
- **SC-003**: For a scanned (image-based) K-1 PDF, the system correctly extracts at least 75% of populated box values without manual correction.
- **SC-004**: Administrator can review, edit, and confirm all extracted values in under 5 minutes for a standard K-1 with 10-15 populated boxes.
- **SC-005**: After confirmation, all downstream model objects (KDocument, Document, Distributions, member allocations) are created within 5 seconds.
- **SC-006**: Member allocation amounts are accurate to the cent — matching each member's ownership percentage multiplied by the partnership-level K-1 values.
- **SC-007**: The complete K-1 import workflow (upload → extract → verify → confirm) saves at least 70% of the time compared to manual data entry for the same K-1 data.
- **SC-008**: Re-processing a previously uploaded K-1 PDF produces results within 30 seconds and shows the updated extraction on the verification screen.
- **SC-009**: Cell mapping customizations persist across sessions and are correctly applied to subsequent K-1 imports for the same partnership.
- **SC-010**: Import history accurately reflects all import attempts, with correct status and links to resulting KDocuments.
## Clarifications
### Session 2026-03-18
- Q: How should data points be combined — fixed IRS categories, custom administrator-defined aggregations, or cross-partnership rollup? → A: Each K-1 box maps 1:1 to a data point, AND the administrator can define custom aggregation rules that combine multiple cells into computed summary values (e.g., "Income Summary" = Box 1 + Box 2 + Box 3). Both individual cell values and aggregated summaries are displayed.
- Q: Should the verification screen require explicit review of every cell, or auto-accept high-confidence values? → A: Auto-accept high-confidence values (pre-checked); user must explicitly review medium/low-confidence fields only. High-confidence values remain visible and editable but do not block confirmation.
- Q: Where should aggregated summary values be visible after import confirmation? → A: On the KDocument detail view, alongside raw box values. The administrator can always reference combined values when viewing any K-1 record, not just during import.
- Q: What happens with extracted data that doesn't match any configured cell mapping? → A: Show unmatched values in a separate "Unmapped Items" section on the verification screen. Administrator can assign them to a cell or discard. No silent data loss.
- Q: Should aggregation summary values be persisted or computed dynamically? → A: Computed dynamically from raw box values each time. Only the aggregation rules are persisted, not the totals. This ensures summaries auto-update when underlying values change (e.g., estimated→final transitions).
## Assumptions
- Entities and Partnerships are already created in the system before K-1 import is attempted. This feature does not create entities or partnerships — only KDocuments, Distributions, Documents, and member allocations.
- The existing KDocument, Distribution, Document, and PartnershipMembership models from spec 001-family-office-transform are implemented and available.
- K-1 PDFs follow one of two formats: digitally-generated (text-based PDF with selectable text) or scanned (image-based PDF requiring OCR). Both must be supported.
- The standard IRS Schedule K-1 (Form 1065) layout is the baseline. K-1s from Form 1041 (trusts/estates) and Form 1120-S (S-corps) are out of scope for V1.
- Document intelligence/OCR processing is handled by a third-party service or library. The specific provider is an implementation detail.
- File storage for uploaded PDFs uses the existing Document storage mechanism from spec 001.
- Confidence scores for extractions are derived from the OCR/document intelligence provider's confidence metrics.
- The administrator has a single active browser session; concurrent K-1 imports for the same partnership are not a V1 requirement.
- Exchange rate handling for K-1 amounts follows the existing currency conversion approach — K-1 values are always in USD as they are IRS tax documents.

282
specs/004-k1-scan-import/tasks.md

@ -0,0 +1,282 @@
# Tasks: K-1 PDF Scan Import
**Input**: Design documents from `/specs/004-k1-scan-import/`
**Prerequisites**: plan.md (required), spec.md (required for user stories), research.md, data-model.md, contracts/
**Tests**: Not explicitly requested — test tasks are excluded. Test fixture PDFs are included in Polish phase for manual/integration validation.
**Organization**: Tasks are grouped by user story to enable independent implementation and testing of each story.
## Format: `[ID] [P?] [Story] Description`
- **[P]**: Can run in parallel (different files, no dependencies)
- **[Story]**: Which user story this task belongs to (e.g., US1, US2, US3)
- Include exact file paths in descriptions
## Path Conventions
- **Backend**: `apps/api/src/app/`
- **Frontend**: `apps/client/src/app/`
- **Shared library**: `libs/common/src/lib/`
- **Database schema**: `prisma/schema.prisma`
- **Config**: `apps/api/src/services/configuration/configuration.service.ts`
---
## Phase 1: Setup (Shared Infrastructure)
**Purpose**: Install dependencies, define database schema, create shared types and configuration
- [X] T001 Install npm dependencies: pdf-parse, @azure/ai-form-recognizer, tesseract.js, @types/pdf-parse in package.json
- [X] T002 [P] Add K1ImportStatus enum (PROCESSING, EXTRACTED, VERIFIED, CONFIRMED, CANCELLED, FAILED), K1ImportSession model, CellMapping model, and CellAggregationRule model to prisma/schema.prisma
- [X] T003 [P] Create shared K-1 TypeScript interfaces (K1ExtractionResult, K1ExtractedField, K1UnmappedItem, K1ConfirmationRequest) in libs/common/src/lib/interfaces/k1-import.interface.ts
- [X] T004 [P] Register AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_KEY environment variables in apps/api/src/services/configuration/configuration.service.ts
- [X] T005 Run Prisma migration to create K-1 import tables and K1ImportStatus enum
- [X] T006 [P] Create shared DTOs (CreateK1ImportDto, VerifyK1ImportDto, ConfirmK1ImportDto) in libs/common/src/lib/dtos/k1-import/
---
## Phase 2: Foundational (Blocking Prerequisites)
**Purpose**: Core module scaffolding, extractor interface, seed data, and frontend plumbing that MUST be complete before ANY user story can be implemented
**⚠️ CRITICAL**: No user story work can begin until this phase is complete
- [X] T007 Create K1Import NestJS module skeleton (module, empty controller, empty service) in apps/api/src/app/k1-import/k1-import.module.ts
- [X] T008 [P] Create CellMapping NestJS module skeleton (module, empty controller, empty service) in apps/api/src/app/cell-mapping/cell-mapping.module.ts
- [X] T009 Register K1ImportModule and CellMappingModule in apps/api/src/app/app.module.ts
- [X] T010 Create K1 extractor interface (K1Extractor with extract method returning K1ExtractionResult) in apps/api/src/app/k1-import/extractors/k1-extractor.interface.ts
- [X] T011 Implement cell mapping seed logic (28 IRS default rows + 3 default aggregation rules) in apps/api/src/app/cell-mapping/cell-mapping.service.ts
- [X] T012 [P] Create K1 import frontend data service (HTTP client for all k1-import and cell-mapping endpoints) in apps/client/src/app/services/k1-import-data.service.ts
- [X] T013 [P] Create frontend route configurations for K1 import pages in apps/client/src/app/pages/k1-import/k1-import-page.routes.ts and cell mapping pages in apps/client/src/app/pages/cell-mapping/cell-mapping-page.routes.ts
**Checkpoint**: Foundation ready — user story implementation can now begin in parallel
---
## Phase 3: User Story 1 — Upload & Scan K-1 PDF (Priority: P1) 🎯 MVP
**Goal**: Administrator uploads a K-1 PDF for a partnership, the system extracts structured data using two-tier extraction (pdf-parse for digital, Azure/tesseract for scanned), and presents results mapped to K-1 box numbers
**Independent Test**: Upload a sample K-1 PDF for an existing partnership, verify extracted values appear on the review screen mapped to correct box numbers within 30 seconds
### Implementation for User Story 1
- [X] T014 [P] [US1] Implement pdf-parse extractor (Tier 1 — digital PDFs with regex-based box extraction) in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [X] T015 [P] [US1] Implement Azure Document Intelligence extractor (Tier 2 — scanned PDFs with key-value pair extraction) in apps/api/src/app/k1-import/extractors/azure-extractor.ts
- [X] T016 [P] [US1] Implement tesseract.js OCR extractor (Tier 2 fallback — self-hosted scanned PDF extraction) in apps/api/src/app/k1-import/extractors/tesseract-extractor.ts
- [X] T017 [P] [US1] Implement K1 confidence scoring service (three-level HIGH/MEDIUM/LOW with validation heuristics per research.md Decision 5) in apps/api/src/app/k1-import/k1-confidence.service.ts
- [X] T018 [US1] Implement K1 field mapper service (raw extraction → K1ExtractedField[] using cell mapping configuration, PDF type detection heuristic) in apps/api/src/app/k1-import/k1-field-mapper.service.ts
- [X] T019 [P] [US1] Create upload DTO (file, partnershipId, taxYear with validation decorators) in apps/api/src/app/k1-import/dto/upload-k1.dto.ts
- [X] T020 [US1] Implement K1 import service upload and extraction orchestration (PDF validation FR-003/FR-028, type detection, extractor routing, session lifecycle) in apps/api/src/app/k1-import/k1-import.service.ts
- [X] T021 [US1] Implement K1 import controller with POST /api/v1/k1-import/upload (multipart) and GET /api/v1/k1-import/:id endpoints in apps/api/src/app/k1-import/k1-import.controller.ts
- [X] T022 [US1] Create K1 import page component (PDF upload UI with partnership selector, tax year input, upload progress, extraction status polling) in apps/client/src/app/pages/k1-import/k1-import-page.component.ts
**Checkpoint**: At this point, User Story 1 should be fully functional — PDF upload triggers extraction and results are retrievable via GET /:id
---
## Phase 4: User Story 2 — Review & Verify Extracted Values (Priority: P1)
**Goal**: Administrator reviews extracted values with confidence indicators, edits incorrect values, resolves unmapped items, views aggregation summaries, and submits verified data. High-confidence values are auto-accepted; medium/low require explicit review before confirmation is allowed.
**Independent Test**: Scan a K-1 PDF, modify an extracted value and a cell label on the verification screen, resolve an unmapped item, confirm all medium/low fields are reviewed, and verify the session transitions to VERIFIED status with corrections saved
### Implementation for User Story 2
- [X] T023 [P] [US2] Create verify DTO (fields array with isReviewed flags, unmappedItems array with resolution status, taxYear override) in apps/api/src/app/k1-import/dto/verify-k1.dto.ts
- [X] T024 [US2] Implement verification logic in K1 import service (EXTRACTED → VERIFIED transition, enforce all medium/low-confidence isReviewed=true per FR-035, validate all unmapped items resolved per validation rule 10) in apps/api/src/app/k1-import/k1-import.service.ts
- [X] T025 [US2] Implement cancel logic in K1 import service (status → CANCELLED, discard extraction data per FR-011) in apps/api/src/app/k1-import/k1-import.service.ts
- [X] T026 [US2] Add PUT /api/v1/k1-import/:id/verify and POST /api/v1/k1-import/:id/cancel endpoints to apps/api/src/app/k1-import/k1-import.controller.ts
- [X] T027 [P] [US2] Implement K1 aggregation service (dynamic SUM computation from CellAggregationRule records, auto-recalculate when cell values change per FR-034/FR-039) in apps/api/src/app/k1-import/k1-aggregation.service.ts
- [X] T028 [US2] Create K1 verification component with mapped cells table (box number, label, value, confidence indicator, inline edit, isReviewed checkbox, custom label override) in apps/client/src/app/pages/k1-import/k1-verification/k1-verification.component.ts
- [X] T029 [US2] Add unmapped items section to verification view (assign to existing/new cell or discard, with resolution tracking per FR-037/FR-038) in apps/client/src/app/pages/k1-import/k1-verification/k1-verification.html
- [X] T030 [US2] Add aggregation summary display to verification view (derived rows distinguished from extracted values, live recalculation on cell edit per FR-033/FR-034) in apps/client/src/app/pages/k1-import/k1-verification/k1-verification.html
- [X] T031 [US2] Implement review enforcement UI (disable Confirm button until all medium/low-confidence fields have isReviewed=true AND all unmapped items resolved per FR-035) in apps/client/src/app/pages/k1-import/k1-verification/k1-verification.component.ts
**Checkpoint**: At this point, User Stories 1 AND 2 should both work — upload → extract → verify flow is complete
---
## Phase 5: User Story 3 — Auto-Create Model Objects from Verified K-1 Data (Priority: P1)
**Goal**: After confirmation, system auto-creates KDocument with verified box values, allocates K-1 amounts to partnership members by ownership percentage, creates Distribution records for Box 19, and links the uploaded PDF as a Document record
**Independent Test**: Confirm verified K-1 data for a partnership with 2 members (60%/40% split), verify KDocument exists with correct box values, member allocations are proportional, Distribution records exist for Box 19, and the PDF Document is linked
### Implementation for User Story 3
- [X] T032 [P] [US3] Create confirm DTO (filingStatus, existingKDocumentAction) in apps/api/src/app/k1-import/dto/confirm-k1.dto.ts
- [X] T033 [US3] Implement K1 allocation service (allocate line items to members by ownership % as of tax year end, rounding adjustment on largest member per validation rule 8) in apps/api/src/app/k1-import/k1-allocation.service.ts
- [X] T034 [US3] Implement confirmation logic in K1 import service (create KDocument with type K1 and verified box values, create Distribution records for Box 19a/19b, create Document record for PDF, link all records per FR-012 through FR-015) in apps/api/src/app/k1-import/k1-import.service.ts
- [X] T035 [US3] Implement duplicate KDocument detection (check existing partnershipId + type K1 + taxYear, prompt UPDATE vs CREATE_NEW per FR-016) in apps/api/src/app/k1-import/k1-import.service.ts
- [X] T036 [US3] Add POST /api/v1/k1-import/:id/confirm endpoint to apps/api/src/app/k1-import/k1-import.controller.ts
- [X] T037 [US3] Create K1 confirmation result component (displays created KDocument summary, member allocations table, distribution records, linked Document) in apps/client/src/app/pages/k1-import/k1-confirmation/k1-confirmation.component.ts
**Checkpoint**: At this point, the complete K-1 import pipeline (upload → extract → verify → confirm → auto-create) is functional — this is the MVP
---
## Phase 6: User Story 4 — Cell Mapping Configuration (Priority: P2)
**Goal**: Administrator can view and customize K-1 cell mapping (rename labels, add custom cells, manage aggregation rules) per partnership, with reset-to-IRS-default capability. Custom mappings are reused for future imports.
**Independent Test**: Modify cell mapping for a partnership (add "Box 20 - Section 199A"), scan a K-1 PDF, verify the custom field appears in verification. Add an aggregation rule, verify computed summary appears on verification and KDocument detail.
### Implementation for User Story 4
- [X] T038 [US4] Implement cell mapping service CRUD (get mappings with global fallback, upsert per-partnership mappings, reset to IRS default, aggregation rule CRUD, compute aggregates for a KDocument) in apps/api/src/app/cell-mapping/cell-mapping.service.ts
- [X] T039 [US4] Implement cell mapping controller (GET /cell-mapping, PUT /cell-mapping, DELETE /cell-mapping/reset, GET /aggregation-rules, PUT /aggregation-rules, GET /aggregation-rules/compute) in apps/api/src/app/cell-mapping/cell-mapping.controller.ts
- [X] T040 [US4] Create cell mapping page component (view/edit cell labels, add custom cells with isCustom flag, manage aggregation rules with source cell selection, reset to defaults button) in apps/client/src/app/pages/cell-mapping/cell-mapping-page.component.ts
- [X] T041 [US4] Integrate per-partnership custom cell mappings into extraction pipeline (field mapper loads partnership-specific mappings, falls back to global defaults for unmapped boxes) in apps/api/src/app/k1-import/k1-field-mapper.service.ts
**Checkpoint**: Cell mapping customization is functional — custom mappings persist across imports
---
## Phase 7: User Story 5 — K-1 Import History & Re-Processing (Priority: P3)
**Goal**: Administrator views import history per partnership, re-processes previously uploaded PDFs with current cell mapping, and manages KDocument status transitions (DRAFT → ESTIMATED → FINAL) for estimated-to-final K-1 workflows
**Independent Test**: Scan two K-1 PDFs for the same partnership/year (one estimated, one final), verify both appear in import history, confirm KDocument status transitions from ESTIMATED to FINAL with updated values
### Implementation for User Story 5
- [X] T042 [US5] Implement import history query (filter by partnershipId and optional taxYear, order by createdAt desc) in apps/api/src/app/k1-import/k1-import.service.ts and GET /api/v1/k1-import/history endpoint in k1-import.controller.ts
- [X] T043 [US5] Implement reprocess endpoint (re-extract stored PDF with current cell mapping, create new session, original session unchanged) in apps/api/src/app/k1-import/k1-import.service.ts and POST /api/v1/k1-import/:id/reprocess in k1-import.controller.ts
- [X] T044 [US5] Add import history list view (date, filename, status, tax year, link to KDocument) to K1 import page in apps/client/src/app/pages/k1-import/k1-import-page.component.ts
- [X] T045 [US5] Implement KDocument status transitions (DRAFT → ESTIMATED → FINAL) with previous value preservation for audit trail (FR-024/FR-025) in apps/api/src/app/k1-import/k1-import.service.ts
- [X] T046 [US5] Extend KDocument detail view with aggregation summary section (display named aggregation totals alongside raw box values per FR-036) in apps/client/src/app/pages/k-document/k-document-detail/
**Checkpoint**: All user stories should now be independently functional
---
## Phase 8: Polish & Cross-Cutting Concerns
**Purpose**: Edge case handling, navigation integration, test fixtures, and end-to-end validation
- [X] T047 [P] Add password-protected PDF detection (FR-029) and multi-entity PDF detection (edge case 5) to upload flow in apps/api/src/app/k1-import/k1-import.service.ts
- [X] T048 [P] Add edge case warnings (EIN mismatch with existing entities, tax year mismatch, zero-extraction warning, ownership % change handling) to verification and confirmation flows in apps/api/src/app/k1-import/k1-import.service.ts
- [X] T049 [P] Add K1 Import and Cell Mapping pages to application navigation/sidebar and register routes in apps/client/src/app/app.routes.ts
- [X] T050 [P] Create test fixture K-1 PDF samples (one digital, one scanned) in test/import/sample-k1-digital.pdf and test/import/sample-k1-scanned.pdf
- [X] T051 Run quickstart.md end-to-end workflow validation (upload → extract → review → verify → confirm → check KDocument + Distributions + Document created)
---
## Dependencies & Execution Order
### Phase Dependencies
- **Setup (Phase 1)**: No dependencies — can start immediately
- **Foundational (Phase 2)**: Depends on Setup completion — BLOCKS all user stories
- **User Story 1 (Phase 3)**: Depends on Foundational phase completion
- **User Story 2 (Phase 4)**: Depends on User Story 1 (needs upload/extract to produce data for verification)
- **User Story 3 (Phase 5)**: Depends on User Story 2 (needs verified data to trigger auto-creation)
- **User Story 4 (Phase 6)**: Depends on Foundational phase — can run in parallel with US1–US3 (cell mapping CRUD is independent), but T041 (integration) should follow US1
- **User Story 5 (Phase 7)**: Depends on User Story 3 (needs confirmed imports for history and re-processing)
- **Polish (Phase 8)**: Depends on all desired user stories being complete
### User Story Dependencies
- **User Story 1 (P1)**: Can start after Foundational (Phase 2) — No dependencies on other stories
- **User Story 2 (P1)**: Depends on US1 — needs extraction results to display verification screen
- **User Story 3 (P1)**: Depends on US2 — needs verified data to trigger model object creation
- **User Story 4 (P2)**: Can start after Foundational — cell mapping CRUD is independent; integration task (T041) should follow US1
- **User Story 5 (P3)**: Depends on US3 — needs completed imports for history and status transitions
### Within Each User Story
- DTOs before services (when using validation decorators)
- Services before controllers (business logic before HTTP layer)
- Backend before frontend (API endpoints before UI components)
- Core implementation before integration/cross-cutting tasks
### Parallel Opportunities
**Setup phase**:
- T002 (prisma), T003 (interfaces), T004 (config), T006 (DTOs) — all different files, all parallel
**Foundational phase**:
- T007 (k1-import module) and T008 (cell-mapping module) — different directories, parallel
- T012 (frontend data service) and T013 (routes) — different files, parallel
**User Story 1**:
- T014 (pdf-parse), T015 (azure), T016 (tesseract), T017 (confidence) — four extractor/service files, all parallel
- T019 (upload DTO) parallel with T014–T018
**User Story 2**:
- T023 (verify DTO) parallel with T027 (aggregation service)
**User Story 4**:
- T038 (service) can start in parallel with US1–US3 since cell mapping CRUD is independent
**Polish phase**:
- T047, T048, T049, T050 — all different files/concerns, all parallel
---
## Parallel Example: User Story 1
```bash
# Launch all extractors in parallel (different files):
Task: T014 "Implement pdf-parse extractor in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts"
Task: T015 "Implement Azure extractor in apps/api/src/app/k1-import/extractors/azure-extractor.ts"
Task: T016 "Implement tesseract extractor in apps/api/src/app/k1-import/extractors/tesseract-extractor.ts"
Task: T017 "Implement confidence scoring in apps/api/src/app/k1-import/k1-confidence.service.ts"
# Then sequential (dependencies):
Task: T018 "Implement field mapper (depends on extractors + confidence service)"
Task: T020 "Implement import service upload orchestration (depends on T018)"
Task: T021 "Implement controller endpoints (depends on T020)"
Task: T022 "Create frontend upload page (depends on T012 data service + T021 API)"
```
---
## Implementation Strategy
### MVP First (User Stories 1 + 2 + 3)
1. Complete Phase 1: Setup
2. Complete Phase 2: Foundational (CRITICAL — blocks all stories)
3. Complete Phase 3: User Story 1 — Upload & Scan
4. Complete Phase 4: User Story 2 — Review & Verify
5. Complete Phase 5: User Story 3 — Auto-Create Model Objects
6. **STOP and VALIDATE**: Test the complete pipeline end-to-end (upload → extract → verify → confirm → KDocument + Distributions created)
7. Deploy/demo if ready — this covers the core value proposition
### Incremental Delivery
1. **Setup + Foundational** → Foundation ready, modules registered, seed data loaded
2. **Add User Story 1** → Test: PDF upload extracts values → First working extraction
3. **Add User Story 2** → Test: Verification screen with editing → User can review/correct
4. **Add User Story 3** → Test: Confirmation creates model objects → **MVP complete!**
5. **Add User Story 4** → Test: Custom cell mapping persists → Customization available
6. **Add User Story 5** → Test: History + re-processing → Full workflow with audit trail
7. **Polish** → Edge cases, navigation, fixtures → Production-ready
8. Each story adds value without breaking previous stories
### Parallel Team Strategy
With multiple developers:
1. Team completes Setup + Foundational together
2. Once Foundational is done:
   - Developer A: User Story 1 → User Story 2 → User Story 3 (sequential — data dependencies)
   - Developer B: User Story 4 (parallel — independent cell mapping CRUD)
3. After US3 complete: Developer B takes User Story 5
4. Both developers collaborate on Polish phase
---
## Notes
- [P] tasks = different files, no dependencies on in-progress tasks
- [Story] label maps task to specific user story for traceability
- US1 → US2 → US3 are sequential (data pipeline dependencies) — cannot be parallelized across stories
- US4 (Cell Mapping) CAN start in parallel with US1–US3 (independent CRUD), except T041 (integration)
- Total tasks: 51 (Setup: 6, Foundational: 7, US1: 9, US2: 9, US3: 6, US4: 4, US5: 5, Polish: 5)
- No test tasks included — tests were not explicitly requested in the feature specification
- Commit after each task or logical group
- Stop at any checkpoint to validate story independently

36
specs/005-k1-parser-fix/checklists/requirements.md

@ -0,0 +1,36 @@
# Specification Quality Checklist: Fix K-1 PDF Parser
**Purpose**: Validate specification completeness and quality before proceeding to planning
**Created**: 2025-07-21
**Feature**: [spec.md](../spec.md)
## Content Quality
- [x] No implementation details (languages, frameworks, APIs)
- [x] Focused on user value and business needs
- [x] Written for non-technical stakeholders
- [x] All mandatory sections completed
## Requirement Completeness
- [x] No [NEEDS CLARIFICATION] markers remain
- [x] Requirements are testable and unambiguous
- [x] Success criteria are measurable
- [x] Success criteria are technology-agnostic (no implementation details)
- [x] All acceptance scenarios are defined
- [x] Edge cases are identified
- [x] Scope is clearly bounded
- [x] Dependencies and assumptions identified
## Feature Readiness
- [x] All functional requirements have clear acceptance criteria
- [x] User scenarios cover primary flows
- [x] Feature meets measurable outcomes defined in Success Criteria
- [x] No implementation details leak into specification
## Notes
- All items pass validation. Specification is ready for `/speckit.clarify` or `/speckit.plan`.
- The spec references "position coordinates" and "font discrimination" in the Background section as domain concepts (how K-1 PDFs work), not as implementation instructions. This is intentional — it describes the problem domain, not the solution approach.
- No [NEEDS CLARIFICATION] markers exist — reasonable defaults were applied for all decisions based on the user's detailed field mapping and explicit guidance.

107
specs/005-k1-parser-fix/contracts/extraction.md

@ -0,0 +1,107 @@
# Contract: K1 Extractor Interface
**Feature**: 005-k1-parser-fix | **Date**: 2026-03-18
## Overview
The K1 extraction system uses a strategy pattern where multiple extractors implement the `K1Extractor` interface. This feature rewrites the `PdfParseExtractor` (Tier 1) internals while preserving the interface contract.
## K1Extractor Interface (unchanged)
```typescript
interface K1Extractor {
extract(buffer: Buffer, fileName: string): Promise<K1ExtractionResult>;
isAvailable(): boolean;
}
```
### extract(buffer, fileName)
**Input**:
- `buffer`: Raw PDF file content as a Node.js Buffer
- `fileName`: Original filename of the uploaded PDF (for logging/diagnostics)
**Output**: `K1ExtractionResult` containing:
- `metadata`: Partnership/partner info, tax year, filing status
- `fields`: Array of `K1ExtractedField` (mapped values)
- `unmappedItems`: Array of `K1UnmappedItem` (values that couldn't be mapped)
- `overallConfidence`: 0.0–1.0 aggregate confidence
- `method`: `'pdf-parse'` (this extractor)
- `pagesProcessed`: number (typically 1)
**Error handling**:
- Throws on non-PDF input (invalid buffer)
- Returns empty fields + low confidence for non-K-1 PDFs
- Never crashes on unexpected PDF content
### isAvailable()
Returns `true` always (no external dependencies or API keys needed).
## K1ExtractionResult Shape (unchanged)
```typescript
interface K1ExtractionResult {
metadata: {
partnershipName: string | null;
partnershipEin: string | null;
partnerName: string | null;
partnerEin: string | null;
taxYear: number | null;
isAmended: boolean;
isFinal: boolean;
};
fields: K1ExtractedField[];
unmappedItems: K1UnmappedItem[];
overallConfidence: number;
method: 'pdf-parse' | 'azure' | 'tesseract';
pagesProcessed: number;
}
```
## K1ExtractedField Shape (expanded)
```typescript
interface K1ExtractedField {
boxNumber: string; // "1", "6a", "19", "20", "J_PROFIT_BEGIN", etc.
label: string; // Display label
customLabel: string | null; // User override
rawValue: string; // Raw text: "498,211", "(409,811)", "SEE STMT", "X"
numericValue: number | null; // Parsed: 498211, -409811, null, null
confidence: number; // 0.0–1.0
confidenceLevel: 'HIGH' | 'MEDIUM' | 'LOW';
isUserEdited: boolean; // Default false
isReviewed: boolean; // Default false
subtype: string | null; // NEW: "ZZ*", "A", "B", "*", null
fieldCategory: string; // NEW: "PART_III", "METADATA", "SECTION_J", etc.
isCheckbox: boolean; // NEW: true for checkbox fields
}
```
## K1UnmappedItem Shape (expanded)
```typescript
interface K1UnmappedItem {
rawLabel: string;
rawValue: string;
numericValue: number | null;
confidence: number;
pageNumber: number;
resolution: 'assigned' | 'discarded' | null;
assignedBoxNumber: string | null;
x: number; // NEW: x position in PDF points
y: number; // NEW: y position in PDF points
fontName: string; // NEW: PDF font identifier
}
```
## Behavioral Contract
1. **Font discrimination**: The extractor MUST dynamically identify which fonts carry data values vs. template text. It MUST NOT hardcode specific font names.
2. **Position matching**: Each data value MUST be mapped to a K-1 field by checking its (x, y) against defined bounding box regions.
3. **Subtype pairing**: For subtype boxes, code and value items at the same y-position (±8 pts) MUST be paired.
4. **Multi-subtype**: Boxes with multiple subtypes (e.g., box 20) MUST produce separate `K1ExtractedField` entries for each subtype row.
5. **Value parsing**: Parenthesized values MUST become negative. Commas MUST be stripped. "SEE STMT" MUST remain as-is with null numericValue.
6. **Unmapped fallback**: Any data value not matching a region MUST appear in `unmappedItems` — zero data loss.
7. **Cleanup**: The PDF document MUST be destroyed after extraction to free worker resources.
8. **Page scope**: Only page 1 is processed. Multi-page K-1s have supplemental statements on subsequent pages (out of scope).

94
specs/005-k1-parser-fix/data-model.md

@ -0,0 +1,94 @@
# Data Model: Fix K-1 PDF Parser
**Feature**: 005-k1-parser-fix | **Date**: 2026-03-18
## Overview
This feature modifies no database tables. All changes are to in-memory TypeScript interfaces in `@ghostfolio/common`. The extraction result flows through: PDF → extractor → K1ExtractionResult → review UI → confirm → persist to existing KDocument/K1Cell tables.
## Entity Changes
### K1ExtractedField (modified)
Existing interface at `libs/common/src/lib/interfaces/k1-import.interface.ts`. Three new fields added:
| Field | Type | Required | Description |
|-------|------|----------|-------------|
| boxNumber | string | yes | Existing: "1", "6a", "19", "20" |
| label | string | yes | Existing: display label from cell mapping |
| customLabel | string \| null | no | Existing: user override |
| rawValue | string | yes | Existing: raw extracted text ("498,211", "(409,811)", "SEE STMT", "X") |
| numericValue | number \| null | no | Existing: parsed numeric value |
| confidence | number | yes | Existing: 0.0–1.0 |
| confidenceLevel | 'HIGH' \| 'MEDIUM' \| 'LOW' | yes | Existing |
| isUserEdited | boolean | yes | Existing: default false |
| isReviewed | boolean | yes | Existing: default false |
| **subtype** | **string \| null** | **no** | **NEW**: subtype code (e.g., "ZZ*", "A", "B", "*"). Null for simple boxes. |
| **fieldCategory** | **string** | **yes** | **NEW**: "PART_III", "METADATA", "SECTION_J", "SECTION_K", "SECTION_L", "SECTION_M", "SECTION_N", "CHECKBOX" |
| **isCheckbox** | **boolean** | **yes** | **NEW**: true if field is a boolean checkbox value. Default false. |
### K1UnmappedItem (modified)
Existing interface. Three new fields for position debugging:
| Field | Type | Required | Description |
|-------|------|----------|-------------|
| rawLabel | string | yes | Existing |
| rawValue | string | yes | Existing |
| numericValue | number \| null | no | Existing |
| confidence | number | yes | Existing |
| pageNumber | number | yes | Existing |
| resolution | 'assigned' \| 'discarded' \| null | no | Existing |
| assignedBoxNumber | string \| null | no | Existing |
| **x** | **number** | **yes** | **NEW**: x position in PDF points |
| **y** | **number** | **yes** | **NEW**: y position in PDF points |
| **fontName** | **string** | **yes** | **NEW**: PDF font identifier |
### K1ExtractionResult (unchanged)
No changes to the top-level extraction result interface. The `metadata`, `fields`, `unmappedItems`, `overallConfidence`, `method`, and `pagesProcessed` structure remains the same.
### K1PositionRegion (new — internal to extractor)
This is NOT a shared interface — it lives inside the extractor module. It defines a bounding box for a K-1 form field region.
| Field | Type | Description |
|-------|------|-------------|
| fieldId | string | Unique identifier (e.g., "BOX_1", "J_PROFIT_BEGIN", "FINAL_K1") |
| boxNumber | string | K-1 box number for Part III fields; section identifier for others |
| label | string | Display label |
| fieldCategory | string | "PART_III", "METADATA", "SECTION_J", etc. |
| valueType | string | "numeric", "text", "checkbox", "percentage" |
| xMin | number | Left edge in PDF points |
| xMax | number | Right edge in PDF points |
| yMin | number | Bottom edge in PDF points |
| yMax | number | Top edge in PDF points |
| hasSubtype | boolean | Whether this region supports subtype codes |
| subtypeXMin | number \| null | Code column left edge (if hasSubtype) |
| subtypeXMax | number \| null | Code column right edge (if hasSubtype) |
### K1PositionRegion Count: 73 regions
See [research.md](research.md) Decision 3 for the complete region map.
## Validation Rules
1. `boxNumber` must be a valid K-1 box identifier (1-21, a/b/c sub-boxes, or section identifiers J/K/L/M/N)
2. `numericValue` must be null for "SEE STMT" and checkbox fields
3. `isCheckbox: true` requires `rawValue: "X"` and `numericValue: null`
4. `subtype` is only set for boxes that support subtypes (11, 12, 13, 14, 15, 17, 19, 20, 21)
5. Parenthesized values MUST have negative `numericValue`
6. Percentage values (Section J) MUST preserve decimal precision (no rounding)
7. `confidence` must be 0.0–1.0 with HIGH ≥ 0.90, MEDIUM 0.70–0.89, LOW 0.50–0.69
## State Transitions
No state machine changes. The existing K1ImportSession status flow remains:
```
PROCESSING → EXTRACTED → VERIFIED → CONFIRMED
PROCESSING → FAILED                (extraction error)
(any pre-confirmation state) → CANCELLED   (administrator cancels the import)
```
## Database Impact
**None.** No Prisma schema changes. The existing `K1Cell` table stores `boxNumber`, `value`, `label` etc. The new `subtype` field on K1ExtractedField can be concatenated into the existing boxNumber field for storage (e.g., "11-ZZ*", "20-A") or stored via the existing `metadata` JSON column.

81
specs/005-k1-parser-fix/plan.md

@ -0,0 +1,81 @@
# Implementation Plan: Fix K-1 PDF Parser — Position-Based Extraction
**Branch**: `005-k1-parser-fix` | **Date**: 2026-03-20 | **Spec**: [spec.md](spec.md)
**Input**: Feature specification from `/specs/005-k1-parser-fix/spec.md`
**Note**: This template is filled in by the `/speckit.plan` command. See `.specify/templates/plan-template.md` for the execution workflow.
## Summary
Rewrite the K-1 PDF parser from regex-based label matching to position-based text extraction using `pdfjs-dist`. The current regex parser incorrectly matches cell numbers instead of actual data values. The new parser will use font discrimination (data fonts vs template fonts) and (x,y) coordinate mapping to bounding-box regions for each K-1 form field. This fixes extraction for all Part I/II metadata, Part III boxes 1-21 (including subtypes, multi-value fields, and SEE STMT references), checkboxes, and Sections J/K/L/M/N. The existing `PdfParseExtractor` already implements position-based extraction — this spec refines its accuracy and adds confidence scoring, unmapped item handling, and dynamic font identification.
## Technical Context
**Language/Version**: TypeScript 5.x, Node.js ≥22.18.0
**Primary Dependencies**: NestJS 11+, Angular 21+, pdfjs-dist (position-based text extraction), Prisma ORM
**Storage**: PostgreSQL (via Prisma), Redis (caching), filesystem (uploaded PDFs)
**Testing**: Jest (unit + integration)
**Target Platform**: Linux server (Docker) / local dev (Windows/macOS)
**Project Type**: Web application (Nx monorepo: api + client + common + ui)
**Performance Goals**: <5 seconds for single-page K-1 extraction (SC-009)
**Constraints**: Zero data loss during extraction (SC-007); preserve existing API contract (FR-025)
**Scale/Scope**: Single-user family office; ~10-50 K-1 PDFs per tax year
## Constitution Check
_GATE: Must pass before Phase 0 research. Re-check after Phase 1 design._
| Gate | Rule | Status | Notes |
|------|------|--------|-------|
| Nx boundary | Features respect project boundaries (api/client/common/ui) | ✅ PASS | Parser in `@ghostfolio/api`, interfaces in `@ghostfolio/common`, UI in `@ghostfolio/client` |
| NestJS module pattern | Module + Controller + Service structure | ✅ PASS | `K1ImportModule` already exists with proper DI |
| Prisma data layer | No direct SQL; use PrismaService | ✅ PASS | All DB access via Prisma ORM |
| TypeScript strict | No unused locals/params, path aliases | ✅ PASS | Existing codebase conventions followed |
| Simplicity first | YAGNI, minimal abstractions | ✅ PASS | Modifying existing `PdfParseExtractor`, not adding new layers |
| Interface-first design | Shared interfaces in `@ghostfolio/common` | ✅ PASS | `K1ExtractionResult`, `K1ExtractedField`, `K1UnmappedItem` already defined |
| Max 3 Nx projects per feature | api + common typical | ✅ PASS | Touches api + common only (client UI already exists, no changes needed) |
**All gates pass. No violations requiring justification.**
## Project Structure
### Documentation (this feature)
```text
specs/005-k1-parser-fix/
├── plan.md # This file
├── research.md # Phase 0 output
├── data-model.md # Phase 1 output
├── quickstart.md # Phase 1 output
├── contracts/ # Phase 1 output
└── tasks.md # Phase 2 output (/speckit.tasks)
```
### Source Code (repository root)
```text
apps/api/src/app/k1-import/
├── extractors/
│ ├── k1-extractor.interface.ts # K1Extractor contract (no changes)
│ ├── k1-position-regions.ts # MODIFY: refine bounding boxes, add tolerance config
│ ├── pdf-parse-extractor.ts # MODIFY: core rewrite — font discrimination, position mapping
│ ├── azure-extractor.ts # No changes (Tier 2)
│ └── tesseract-extractor.ts # No changes (Tier 2 fallback)
├── k1-import.service.ts # Minor: add warning generation for unmapped items
├── k1-import.controller.ts # No changes
├── k1-field-mapper.service.ts # Minor: handle new confidence levels
├── k1-confidence.service.ts # MODIFY: integrate position-match confidence
└── k1-import.module.ts # No changes
libs/common/src/lib/interfaces/
└── k1-import.interface.ts # Minor: add fontName/position to K1UnmappedItem if needed
prisma/
└── schema.prisma # No changes (existing schema sufficient)
```
**Structure Decision**: Existing Nx monorepo structure is used. The core change is within `apps/api/src/app/k1-import/extractors/` — specifically `pdf-parse-extractor.ts` and `k1-position-regions.ts`. No new modules, no new Nx projects.
## Complexity Tracking
> No violations detected. All changes fit within existing module boundaries.

64
specs/005-k1-parser-fix/quickstart.md

@ -0,0 +1,64 @@
# Quickstart: Fix K-1 PDF Parser
**Feature**: 005-k1-parser-fix | **Date**: 2026-03-18
## Prerequisites
- Node.js ≥22.18.0 with npm
- Docker running (PostgreSQL + Redis via docker-compose)
- Existing `004-k1-scan-import` feature branch merged or available
## Setup
```bash
# 1. Switch to feature branch
git checkout 005-k1-parser-fix
# 2. Install dependencies (should be no-op — no new packages)
npm install
# 3. Start dev infrastructure
docker compose -f docker/docker-compose.dev.yml up -d
# 4. Run database setup
npm run database:setup
# 5. Start API server
npm run start:server
# 6. Start client (separate terminal)
npm run start:client
```
## Files to Modify
| File | Action | Description |
|------|--------|-------------|
| `libs/common/src/lib/interfaces/k1-import.interface.ts` | MODIFY | Add `subtype`, `fieldCategory`, `isCheckbox` to K1ExtractedField; add `x`, `y`, `fontName` to K1UnmappedItem |
| `apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts` | REWRITE | Replace regex-based extraction with pdfjs-dist position-based extraction |
| `apps/api/src/app/k1-import/extractors/k1-position-regions.ts` | CREATE | Define 73 bounding box regions for all K-1 form fields |
## Testing
```bash
# Upload a K-1 PDF via the API
curl -X POST http://localhost:3333/api/v1/k1-import/upload \
-H "Authorization: Bearer <token>" \
-F "file=@path/to/k1.pdf"
# Check extraction results
curl http://localhost:3333/api/v1/k1-import/session/<session-id> \
-H "Authorization: Bearer <token>"
```
## Verification Checklist
- [ ] Box 11 extracted with subtype "ZZ*" and value -409615
- [ ] Box 19 extracted with subtype "A" and value 4493757
- [ ] Box 20 extracted with 4 separate subtype entries (A, B, Z, *)
- [ ] Box 21 extracted with subtype "*" and value 196
- [ ] Section J percentages extracted (3.032900, 0.000000)
- [ ] Section L capital values extracted with correct signs
- [ ] Final K-1 checkbox detected as true
- [ ] Unmapped items list is empty (all values mapped) for the reference PDF
- [ ] Non-K-1 PDF produces error, not garbage data

221
specs/005-k1-parser-fix/research.md

@ -0,0 +1,221 @@
# Research: Fix K-1 PDF Parser
**Feature**: 005-k1-parser-fix | **Date**: 2026-03-18
## Research Summary
All technical unknowns resolved. Three key decisions made:
1. **pdfjs-dist** for position-based text extraction (already installed)
2. **Font discrimination + position region mapping** as the extraction strategy
3. **73 bounding box regions** defined covering all K-1 form fields
---
## Decision 1: PDF Parsing Library
**Decision**: Use `pdfjs-dist` v5.4.296 directly (already installed as transitive dependency of pdf-parse v2.4.5)
**Rationale**:
- Already installed — no new npm dependencies
- `page.getTextContent()` returns `TextItem` objects with precise (x, y) coordinates, font name, width, height
- `@napi-rs/canvas` v0.1.80 (also already installed) provides DOMMatrix polyfill for Node.js via the legacy build
- The legacy build at `pdfjs-dist/legacy/build/pdf.mjs` auto-polyfills `DOMMatrix`, `ImageData`, `Path2D`, and `navigator`
**Alternatives considered**:
- **pdf-parse v2.4.5** (currently used): Wraps pdfjs-dist but does NOT expose position coordinates. Only returns concatenated text strings. Insufficient for position-based extraction.
- **pdf-lib**: Can read AcroForm fields, but K-1 PDFs have zero AcroForm fields (values are text overlays). Not useful.
- **pdf2json**: Older PDF.js fork with positioned text. Redundant — pdfjs-dist v5.4 is already available and more current.
### API Details
**Import** (must use dynamic import — API project compiles to CommonJS via webpack):
```typescript
const { getDocument, GlobalWorkerOptions } = await import('pdfjs-dist/legacy/build/pdf.mjs');
```
**Worker configuration** (required in v5.4.x):
```typescript
const workerPath = 'file:///' + resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs').replace(/\\/g, '/');
GlobalWorkerOptions.workerSrc = workerPath;
```
**Document loading**:
```typescript
const loadingTask = getDocument({
data: new Uint8Array(buffer),
standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
cMapPacked: true,
isEvalSupported: false,
disableFontFace: true,
});
```
**Text extraction**:
```typescript
const page = await pdfDoc.getPage(1);
const textContent = await page.getTextContent({ includeMarkedContent: false });
// textContent.items: TextItem[] with { str, transform, width, height, fontName, hasEOL, dir }
// textContent.styles: { [fontName]: { fontFamily, ascent, descent, vertical } }
// transform[4] = x, transform[5] = y (PDF points, origin bottom-left)
```
**Cleanup** (required):
```typescript
await pdfDoc.destroy(); // Terminates worker, frees resources
```
### Gotchas
1. Must use `pdfjs-dist/legacy/build/pdf.mjs` — main build crashes with `DOMMatrix is not defined`
2. Must set `GlobalWorkerOptions.workerSrc` to the worker file path — empty string no longer works in v5.4.x
3. `workerSrc` must be a `file://` URL on Windows
4. Use `await import()` not static `import` — CommonJS compat via webpack
5. Y-coordinates are bottom-up: `transform[5]` = 792 is top of page, 0 is bottom
6. `page.view` gives `[0, 0, 612, 792]` — standard US Letter
---
## Decision 2: Extraction Strategy
**Decision**: Hybrid approach — font discrimination (primary) + position-based region mapping (secondary)
**Rationale**:
- Font filtering instantly isolates ~30 data values from 467 total text items on page 1
- Position mapping then determines exactly which K-1 field each value belongs to
- Two-phase filtering is more robust than either approach alone
- Resilient to minor position variations across different K-1 generators
**Alternatives considered**:
- **Regex label matching** (current approach): Fundamentally broken — pdf-parse outputs all template labels first, then all data values separately. Labels and values are never adjacent in the text stream.
- **Sequential positional parsing** (text order): Fragile — depends on exact text ordering which varies between generators. Also can't distinguish data values from template text.
- **Pure position-based** (no font check): Would work but requires matching against all 73 regions for all 467 items. Font filtering first reduces the problem to ~30 items × 73 regions.
### Font Discrimination Details
From the sample K-1 PDF, text items use these fonts:
| fontName | fontFamily | Usage | Count |
|----------|-----------|-------|-------|
| g_d0_f1 | serif | Template labels, headers | ~350 items |
| g_d0_f2 | sans-serif | "20" in tax year | 1 item |
| g_d0_f3 | sans-serif | "25" in tax year (data) | 1 item |
| g_d0_f5 | serif | Footnotes, small text | ~80 items |
| g_d0_f6 | sans-serif | Data values | ~10 items |
| g_d0_f7 | monospace | Checkboxes/codes | ~5 items |
| g_d0_f8 | sans-serif | Data values (primary) | ~20 items |
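**Key insight**: Template labels use `serif` fonts, while data values exclusively use `sans-serif` or `monospace` fonts. Filtering by `fontFamily !== 'serif'` therefore keeps every data value; the one borderline item that also passes this filter is the "20" tax-year prefix (`g_d0_f2`, sans-serif), which the table above does not mark as data.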
**Dynamic detection**: Since font names vary across generators, the algorithm should:
1. Get all unique fonts from `textContent.styles`
2. Identify template fonts: the fonts used by known template text items (items matching "Schedule K-1", "Form 1065", "Ordinary business income", etc.)
3. Non-template fonts = data fonts
4. Filter items to only those using data fonts
---
## Decision 3: Position Region Map
**Decision**: Define 73 bounding box regions covering all K-1 form fields with ±15 pt tolerance
**Rationale**:
- K-1 form layout is standardized by the IRS — position regions are consistent across generators
- 22 positions verified from actual PDF extraction with exact coordinates
- Remaining ~51 positions interpolated from verified anchors and standard IRS form spacing
- ±15 pt tolerance handles minor variations between generators
### Verified Anchor Points (from actual K-1 PDF)
| Value | x | y | Field |
|-------|-----|-------|-------|
| "X" | 324.3 | 746.2 | FINAL_K1 |
| "X" | 180.3 | 446.6 | G_LIMITED |
| "X" | 58.0 | 422.9 | H1_DOMESTIC |
| "3.032900" | 139.1 | 339.1 | J_PROFIT_BEGIN |
| "0.000000" | 250.1 | 339.1 | J_PROFIT_END |
| "498,211" | 180.8 | 254.5 | K_NONRECOURSE_BEGIN |
| "X" | 294.9 | 205.8 | K2_CHECKBOX |
| "4,903,568" | 257.8 | 157.4 | L_BEG_CAPITAL |
| "(409,811)" | 259.3 | 133.7 | L_CURR_YR_INCOME |
| "4,493,757" | 257.8 | 109.4 | L_WITHDRAWALS |
| "X" | 101.2 | 74.2 | M_NO |
| "(5,373)" | 271.5 | 49.7 | N_BEGINNING |
| "(409,811)" | 92.1 | 2.8 | N_ENDING |
| "ZZ*" | 314.2 | 314.4 | BOX_11_CODE |
| "(409,615)" | 403.9 | 314.4 | BOX_11_VALUE |
| "X" | 563.3 | 603.8 | BOX_16_K3 |
| "A" | 455.2 | 423.2 | BOX_19_CODE |
| "4,493,757" | 530.6 | 422.0 | BOX_19_VALUE |
| "*" | 456.4 | 267.1 | BOX_21_CODE |
| "196" | 555.6 | 266.1 | BOX_21_VALUE |
### Region Layout Summary
| Group | X range | Y range | Fields |
|-------|---------|---------|--------|
| Header | 120–450 | 731–785 | 5: TAX_YEAR, TAX_YEAR_BEGIN/END, FINAL_K1, AMENDED_K1 |
| Part I | 30–290 | 610–735 | 4: A_EIN, B_NAME, B_ADDR, C_IRS_CENTER |
| Part II | 30–306 | 350–610 | 12: D through I2 |
| Section J | 120–305 | 285–354 | 7: profit/loss/capital begin/end + decrease sale |
| Section K | 155–310 | 176–270 | 8: nonrecourse/qual/recourse begin/end + K2/K3 checkboxes |
| Section L | 220–306 | 83–173 | 6: beg/contributed/income/other/withdrawals/end |
| Section M | 50–120 | 59–89 | 2: M_YES, M_NO |
| Section N | 60–306 | 0–65 | 2: N_BEGINNING, N_ENDING |
| Part III Left | 300–455 | 245–698 | 19: boxes 1–13 (including a/b/c sub-boxes) |
| Part III Right | 440–595 | 245–710 | 8: boxes 14–21 |
### Subtype Handling
Boxes 11, 12, 13 (left column) and 14, 15, 17, 19, 20, 21 (right column) can have subtype codes:
- **Left column**: code at x ≈ 300–350, value at x ≈ 370–455
- **Right column**: code at x ≈ 440–475, value at x ≈ 510–595
Pairing algorithm: find code and value items on the same y-line (within ±8 pts).
Box 20 supports multiple subtype rows (A, B, V/Z, *) spaced ~23 pts apart within y range 275–395.
---
## Decision 4: Numeric Value Parsing
**Decision**: Parse all K-1 values using consistent rules
**Rationale**: IRS K-1 forms use standard US financial formatting. No ambiguity in the parsing rules.
**Rules**:
1. Remove commas: "4,903,568" → "4903568"
2. Parenthesized = negative: "(409,811)" → "-409811" → -409811
3. Leading minus = negative: "-5,373" → -5373
4. Dollar sign: strip "$" if present
5. Decimal percentages: "3.032900" → 3.032900 (preserve precision, do not round)
6. "SEE STMT" / "STMT" → `numericValue: null`, `rawValue: "SEE STMT"`
7. "X" (checkbox) → boolean true, `rawValue: "X"`
8. Empty / whitespace → omit field or `numericValue: 0`
9. "E-FILE" and other text values → `numericValue: null`, preserve as rawValue
---
## Decision 5: Interface Expansion
**Decision**: Add `subtype`, `fieldCategory`, and `isCheckbox` to `K1ExtractedField`; add position info to `K1UnmappedItem`
**Rationale**: The existing interface lacks fields needed for subtype codes (box 11 "ZZ*", box 20 "A"/"B"), field categorization (Part III vs Section J vs metadata), and checkbox discrimination. Adding these fields is backward-compatible (all optional/nullable).
**New fields on K1ExtractedField**:
- `subtype: string | null` — subtype code (e.g., "ZZ*", "A", "B", "*")
- `fieldCategory: 'PART_III' | 'METADATA' | 'SECTION_J' | 'SECTION_K' | 'SECTION_L' | 'SECTION_M' | 'SECTION_N' | 'CHECKBOX'`
- `isCheckbox: boolean` — whether this field is a boolean checkbox value
**New fields on K1UnmappedItem**:
- `x: number` — x position in PDF points
- `y: number` — y position in PDF points
- `fontName: string` — font identifier for debugging
---
## Open Items
None. All NEEDS CLARIFICATION items resolved.

202
specs/005-k1-parser-fix/spec.md

@ -0,0 +1,202 @@
# Feature Specification: Fix K-1 PDF Parser — Position-Based Extraction
**Feature Branch**: `005-k1-parser-fix`
**Created**: 2025-07-21
**Status**: Draft
**Input**: User description: "Fix K-1 PDF parser to correctly extract positional values from IRS Schedule K-1 (Form 1065) PDFs. The current regex-based parser matches cell numbers as values instead of actual data. Rewrite using position-based extraction with pdfjs-dist to reliably map form field values by their (x, y) coordinates and font discrimination. Support all Part I/II metadata fields, Part III income/deduction boxes (1-21), subtype codes, checkboxes, percentages, and 'SEE STMT' references. Allow users to manually map any ambiguous or unrecognized fields."
## Background
E-filed IRS Schedule K-1 (Form 1065) PDFs have a specific structure: form template text (labels, headings, instructions) and data values are rendered as separate text overlays on the same page. When extracted as plain text, all template text appears first, followed by all data values in a flat positional list — without any labels attached to the values. The current regex-based parser attempts to match labels to adjacent values, which fundamentally fails because labels and values are in completely different sections of the extracted text.
### PDF Structure Discovery
Analysis of a real e-filed K-1 PDF reveals:
- **467 total text items** on page 1
- **Zero AcroForm fields** — values are positioned text overlays, not fillable form fields
- **Font discrimination**: All data values use a distinct font (e.g., `g_d0_f8`) that differs from template text fonts
- **Position coordinates**: Each text item has precise (x, y) coordinates via the PDF transformation matrix
- **K-1 form layout**: Three distinct regions — Part I/II (left column, partnership/partner info), Part III left column (boxes 1-13), Part III right column (boxes 14-21)
- **Subtype codes**: Some boxes (11, 19, 20, 21) have letter/symbol codes as separate text items in the same y-band as their values
- **Checkboxes**: Represented as "X" text items at checkbox positions
## User Scenarios & Testing _(mandatory)_
### User Story 1 — Accurate K-1 Value Extraction (Priority: P1)
As an investor uploading an e-filed K-1 PDF, I want the system to correctly extract all Part III box values (boxes 1-21) so that my income, deductions, and credits are accurately captured without manual correction.
**Why this priority**: This is the core value proposition. If Part III box values are wrong, the entire K-1 import feature is unusable. Every K-1 has Part III data, and getting it right eliminates the most painful manual data entry.
**Independent Test**: Upload a sample e-filed K-1 PDF and verify that all Part III boxes with values are correctly extracted with the right box number, value, and sign (parenthesized values as negative).
**Acceptance Scenarios**:
1. **Given** an e-filed K-1 PDF with box 1 value "498,211", **When** the PDF is uploaded and parsed, **Then** box 1 is extracted with `rawValue: "498,211"` and `numericValue: 498211`
2. **Given** an e-filed K-1 PDF with box 11 having subtype "ZZ*" and value "(409,615)", **When** parsed, **Then** box 11 is extracted with `boxNumber: "11"`, `subtype: "ZZ*"`, `rawValue: "(409,615)"`, and `numericValue: -409615`
3. **Given** an e-filed K-1 PDF with box 19 subtype "A" and value "4,493,757", **When** parsed, **Then** box 19 is extracted with `boxNumber: "19"`, `subtype: "A"`, `rawValue: "4,493,757"`, and `numericValue: 4493757`
4. **Given** an e-filed K-1 PDF with box 20 subtypes A, B, Z, and * all showing "SEE STMT", **When** parsed, **Then** four separate fields are extracted for box 20, each with the correct subtype code and `rawValue: "SEE STMT"`, `numericValue: null`
5. **Given** an e-filed K-1 PDF with box 21 having subtype "*" and value "196", **When** parsed, **Then** box 21 is extracted with `boxNumber: "21"`, `subtype: "*"`, `rawValue: "196"`, and `numericValue: 196`
6. **Given** a value in parentheses like "(409,811)", **When** parsed, **Then** the numericValue is `-409811` (negative)
7. **Given** an empty box (no value present), **When** parsed, **Then** the box is either omitted from results or included with `numericValue: 0`
---
### User Story 2 — Partnership & Partner Metadata Extraction (Priority: P1)
As an investor, I want the system to extract Part I (partnership info) and Part II (partner info) metadata — including names, EINs, addresses, tax year, and filing status — so I can match K-1 documents to the correct partnership and tax period.
**Why this priority**: Metadata is essential for identifying which partnership and partner the K-1 belongs to, and for the tax year assignment. Without this, K-1 data cannot be properly filed.
**Independent Test**: Upload a K-1 PDF and verify that partnership name, EIN, partner name, EIN, tax year, and final/amended status are correctly extracted.
**Acceptance Scenarios**:
1. **Given** an e-filed K-1 that is marked "Final K-1", **When** parsed, **Then** `metadata.isFinal` is `true`
2. **Given** an e-filed K-1 for tax year 2025, **When** parsed, **Then** `metadata.taxYear` is `2025`
3. **Given** a K-1 with IRS Center field showing "E-FILE", **When** parsed, **Then** the IRS center metadata field is captured as "E-FILE"
---
### User Story 3 — Part I/II Financial Fields Extraction (Priority: P2)
As an investor, I want the system to extract Part I/II financial fields — Section J (profit/loss/capital percentages), Section K (liabilities), Section L (capital account analysis), Section M (contributed property), and Section N (partner share of net income) — so that my partnership interest details are fully captured.
**Why this priority**: These fields provide the partnership interest context (ownership percentages, capital account, liabilities) needed for tax reporting. They are secondary to Part III income boxes but still required for a complete K-1 record.
**Independent Test**: Upload a K-1 PDF and verify J/K/L/M/N sections are extracted with correct begin/end values and signs.
**Acceptance Scenarios**:
1. **Given** a K-1 with Section J showing profit beginning "3.032900" and ending "0.000000", **When** parsed, **Then** fields are extracted: J_PROFIT_BEGIN = 3.032900, J_PROFIT_END = 0.000000
2. **Given** a K-1 with Section J loss and capital rows identical to profit, **When** parsed, **Then** J_LOSS_BEGIN, J_LOSS_END, J_CAPITAL_BEGIN, J_CAPITAL_END are all correctly extracted
3. **Given** a K-1 with Section K showing nonrecourse beginning "498,211", **When** parsed, **Then** K_NONRECOURSE_BEGIN = 498211
4. **Given** a K-1 with Section L showing beginning capital "4,903,568", withdrawals "4,493,757", and current year net income/loss "(409,811)", **When** parsed, **Then** L_BEG_CAPITAL = 4903568, L_WITHDRAWALS = 4493757, L_CURR_YR_INCOME = -409811
5. **Given** a K-1 with Section M checkbox "No" marked, **When** parsed, **Then** M_CONTRIBUTED_PROPERTY = false (or "NO")
6. **Given** a K-1 with Section N showing beginning "(5,373)" and ending "(409,811)", **When** parsed, **Then** N_BEGINNING = -5373, N_ENDING = -409811
---
### User Story 4 — Checkbox and Boolean Field Extraction (Priority: P2)
As an investor, I want checkbox fields (Final K-1, Amended K-1, General/Limited partner, Domestic/Foreign partner, K-2/K-3 attached indicators) to be correctly identified as boolean values so they accurately reflect my filing status.
**Why this priority**: Checkboxes determine filing status and partner classification. Misidentifying them can lead to incorrect tax treatment. They are simpler to extract (just "X" at a position) but critical to get right.
**Independent Test**: Upload a K-1 PDF with known checkbox states and verify all checkboxes are correctly identified as checked or unchecked.
**Acceptance Scenarios**:
1. **Given** a K-1 with "Final K-1" checked and "Amended K-1" unchecked, **When** parsed, **Then** `isFinal: true`, `isAmended: false`
2. **Given** a K-1 with "Limited partner" checked, **When** parsed, **Then** the partner type field reflects "Limited"
3. **Given** a K-1 with "Domestic partner" checked, **When** parsed, **Then** the partner domestic/foreign field reflects "Domestic"
4. **Given** a K-1 with box 16 "K-3 attached" checked, **When** parsed, **Then** box 16 reflects `true`
---
### User Story 5 — Manual Mapping Fallback for Ambiguous Fields (Priority: P3)
As an investor, when the parser cannot confidently map a value to a specific K-1 field (due to unexpected positioning, font, or layout variation), I want to see those values listed as "unmapped" so I can manually assign them to the correct fields through the review interface.
**Why this priority**: No parser is perfect. Different K-1 generators may produce slightly different layouts. Providing a manual mapping fallback ensures data is never lost and users always have control, even when automatic extraction is imperfect.
**Independent Test**: Upload a K-1 PDF where some values fall outside expected position regions, and verify those values appear in the unmapped items list for manual assignment.
**Acceptance Scenarios**:
1. **Given** a K-1 PDF where a value appears at an unexpected position, **When** parsed, **Then** that value appears in the `unmappedItems` list with its raw text, position, and page number
2. **Given** an unmapped item in the review interface, **When** the user assigns it to box "4", **Then** it moves to the extracted fields list as box 4 with the assigned value
3. **Given** an unmapped item, **When** the user marks it as "discarded", **Then** it is excluded from the final import
---
### Edge Cases
- **Multi-page K-1**: Some K-1s span multiple pages. The parser should handle page 1 (the standard K-1 form) and recognize that subsequent pages are supplemental statements, not additional K-1 data to parse.
- **All-empty K-1**: A K-1 with zero data values (all boxes empty) should produce an extraction result with no fields and no errors.
- **Negative values**: Parenthesized values like "(40,029)" must be parsed as negative numbers (-40029). Plain minus signs (e.g., "-5,373") should also be handled.
- **"SEE STMT" references**: Some boxes contain "SEE STMT" (See Statement) instead of a numeric value. These should be captured as-is with `numericValue: null`.
- **Tab-separated subtype/value pairs**: Values like "ZZ*\t(409,615)" or "A\t4,493,757" contain a subtype code tab-separated from the value. Both parts must be captured.
- **Multiple subtypes per box**: Box 20 can have multiple rows (A, B, Z, *), each with its own value. All must be extracted as separate fields.
- **Non-standard fonts**: Different K-1 generators may use different font names. The parser should identify data fonts dynamically rather than hardcoding a specific font name.
- **Corrupted or non-K-1 PDFs**: If a PDF has no recognizable K-1 structure (no matching template text), extraction should fail gracefully with a meaningful error message, not crash.
- **Percentage values**: Section J values are decimal percentages (e.g., "3.032900"). These should be preserved as-is without rounding.
## Requirements _(mandatory)_
### Functional Requirements
#### Core Extraction
- **FR-001**: System MUST extract all Part III box values (boxes 1 through 21) from e-filed K-1 PDFs using position-based text extraction rather than regex label matching
- **FR-002**: System MUST extract each text item's position coordinates (x, y) and font information from the PDF to determine which form field a value belongs to
- **FR-003**: System MUST discriminate between template text (labels/headings) and data values using font characteristics as the primary differentiator, with position and content pattern as secondary signals
- **FR-004**: System MUST define position regions (bounding boxes) for each K-1 form field and map extracted data values to the correct field based on which region their coordinates fall within
- **FR-005**: System MUST parse parenthesized values as negative numbers: "(409,811)" → -409811
- **FR-006**: System MUST handle comma-separated thousands in numeric values: "4,903,568" → 4903568
- **FR-007**: System MUST preserve "SEE STMT" values as raw text with a null numeric value and not attempt numeric parsing
#### Subtype and Multi-Value Fields
- **FR-008**: System MUST extract subtype codes for boxes that support them (boxes 11, 12, 13, 14, 15, 17, 19, 20, 21) where a letter or symbol code appears as a separate text item in the same y-band (row) as the value
- **FR-009**: System MUST support multiple subtype rows per box (e.g., box 20 with subtypes A, B, Z, and *)
- **FR-010**: System MUST capture tab-separated subtype/value pairs where the code and value appear on the same text line
#### Metadata and Part I/II
- **FR-011**: System MUST extract Part I/II metadata including: partnership name, partnership EIN, partner name, partner EIN, tax year, IRS center, and filing status (final/amended)
- **FR-012**: System MUST extract Section J percentage fields (profit, loss, capital — beginning and ending)
- **FR-013**: System MUST extract Section K liability fields (nonrecourse, qualified nonrecourse, recourse — beginning and ending as available)
- **FR-014**: System MUST extract Section L capital account fields (beginning capital, capital contributed, current year net income/loss, other increase/decrease, withdrawals/distributions, ending capital)
- **FR-015**: System MUST extract Section M (contributed property indicator) and Section N (partner share of net unrecognized 704(c) gain/loss — beginning and ending)
#### Checkbox Fields
- **FR-016**: System MUST identify checkbox fields marked with "X" at known checkbox positions (Final K-1, Amended K-1, General/Limited partner, Domestic/Foreign partner, K-2/K-3 attached)
- **FR-017**: System MUST represent checkbox values as boolean (true = "X" present at the checkbox position, false = absent)
#### Confidence and Unmapped Items
- **FR-018**: System MUST assign a confidence level (HIGH, MEDIUM, LOW) to each extracted field based on how precisely the value's position matches the expected region
- **FR-019**: System MUST place any extracted value that does not fall within a defined position region into the "unmapped items" list, capturing the raw text, position, and page number
- **FR-020**: System MUST allow users to manually assign unmapped items to specific box numbers through the existing review interface
- **FR-021**: System MUST allow users to discard unmapped items they determine are irrelevant
#### Robustness
- **FR-022**: System MUST handle K-1 PDFs from different e-filing generators that may use different font names by dynamically identifying which font is used for data values
- **FR-023**: System MUST gracefully handle PDFs that are not K-1 forms or have unrecognizable layouts, returning a meaningful error rather than crashing
- **FR-024**: System MUST process only page 1 of the K-1 PDF for standard form data extraction (supplemental statement pages are out of scope for this feature)
- **FR-025**: System MUST preserve the existing extraction interface contract so that upstream services (K1 import service, review UI) continue to work without changes
### Key Entities
- **K1ExtractedField**: A single parsed value from the K-1 form. Key attributes: box number, optional subtype code, raw text value, parsed numeric value, confidence level, field category (Part III box, Part I/II metadata, Section J/K/L/M/N), and whether it's a checkbox.
- **K1PositionRegion**: A defined bounding area on the K-1 form page corresponding to a specific field. Attributes: field identifier, x-min, x-max, y-min, y-max, expected value type (numeric, text, checkbox, percentage).
- **K1UnmappedItem**: A data value extracted from the PDF that couldn't be mapped to any defined position region. Attributes: raw text, x/y position, page number, user resolution (assigned/discarded/pending).
- **K1ExtractionResult**: The complete output of parsing a K-1 PDF. Contains metadata (partnership, partner, tax year, filing status), mapped fields array, unmapped items array, overall confidence, and extraction method identifier.
## Success Criteria _(mandatory)_
### Measurable Outcomes
- **SC-001**: For a standard e-filed K-1 PDF, all Part III boxes with values are extracted with the correct box number and value in a single upload — no manual corrections needed for the reference test PDF
- **SC-002**: Numeric values including negative (parenthesized) amounts are parsed correctly with 100% accuracy for well-formed values
- **SC-003**: All subtype codes (e.g., box 11 "ZZ*", box 19 "A", box 20 "A"/"B"/"Z"/"*") are correctly paired with their values
- **SC-004**: Part I/II metadata (tax year, filing status, partner type) is extracted correctly
- **SC-005**: Section J percentages, Section K liabilities, Section L capital account, and Section N values are extracted with correct signs and decimal precision
- **SC-006**: Users can review and correct any extraction result through the existing review interface within 2 minutes
- **SC-007**: Values that cannot be automatically mapped appear in the unmapped items list, ensuring zero data loss during extraction
- **SC-008**: Non-K-1 PDFs produce a clear error message rather than incorrect/garbage data
- **SC-009**: Extraction completes within 5 seconds for a single-page K-1 PDF
## Assumptions
- All K-1 PDFs follow the standard IRS Schedule K-1 (Form 1065) layout for 2025 and adjacent tax years. Custom or non-standard K-1 formats are not in scope.
- E-filed K-1 PDFs render values as positioned text overlays (not AcroForm fields). The system does not need to support fillable PDF form field extraction.
- The existing review/confirmation UI and data flow (upload → extract → review → confirm) remains unchanged. Only the extraction logic is being rewritten.
- Font names vary across K-1 generators; the parser will dynamically identify the data font rather than hardcoding a specific font name.
- "SEE STMT" references indicate supplemental statement pages exist but parsing those supplemental pages is out of scope for this feature.
- PDF page coordinates use standard PDF coordinate system (origin at bottom-left, y increases upward).
- The position region map is calibrated for the standard IRS K-1 form layout; minor position adjustments may be needed over time as different generators are encountered.

237
specs/005-k1-parser-fix/tasks.md

@ -0,0 +1,237 @@
# Tasks: Fix K-1 PDF Parser — Position-Based Extraction
**Input**: Design documents from `/specs/005-k1-parser-fix/`
**Prerequisites**: plan.md, spec.md, research.md, data-model.md, contracts/extraction.md, quickstart.md
**Tests**: Not explicitly requested — test tasks omitted.
**Organization**: Tasks grouped by user story to enable independent implementation and testing.
## Format: `[ID] [P?] [Story] Description`
- **[P]**: Can run in parallel (different files, no dependencies)
- **[Story]**: Which user story this task belongs to (US1–US5)
- Exact file paths included in all descriptions
## Path Conventions
- **Monorepo (Nx)**: `apps/api/src/`, `libs/common/src/`
- **Extractor module**: `apps/api/src/app/k1-import/extractors/`
- **Shared interfaces**: `libs/common/src/lib/interfaces/`
---
## Phase 1: Setup
**Purpose**: Expand shared interfaces to support new extraction fields
- [x] T001 Add `subtype: string | null`, `fieldCategory: string`, and `isCheckbox: boolean` to K1ExtractedField interface, and add `x: number`, `y: number`, `fontName: string` to K1UnmappedItem interface in libs/common/src/lib/interfaces/k1-import.interface.ts
---
## Phase 2: Foundational (Blocking Prerequisites)
**Purpose**: Core extraction infrastructure that ALL user stories depend on — pdfjs-dist integration, position regions, font discrimination, value parsing
**⚠️ CRITICAL**: No user story work can begin until this phase is complete
- [x] T002 [P] Create K1PositionRegion interface and export all 73 bounding box region definitions (Header, Part I, Part II, Sections J/K/L/M/N, Part III left boxes 1-13, Part III right boxes 14-21) with ±15pt tolerance using verified anchor coordinates from research.md in apps/api/src/app/k1-import/extractors/k1-position-regions.ts
- [x] T003 Replace existing regex-based extraction with pdfjs-dist scaffold: dynamic `await import('pdfjs-dist/legacy/build/pdf.mjs')`, GlobalWorkerOptions.workerSrc set to `file://` path of pdf.worker.mjs, getDocument() with buffer, getPage(1), getTextContent(), and pdfDoc.destroy() cleanup in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T004 Implement dynamic font discrimination using textContent.styles: classify each font as template (serif fontFamily) or data (sans-serif/monospace fontFamily), filter text items to only data-font items in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T005 Implement findRegionForPosition() function that takes (x, y) coordinates and returns the matching K1PositionRegion from the 73-region map using ±15pt bounding box tolerance, or null if no match in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T006 Implement parseK1Value() utility: strip commas, parenthesized values → negative number, leading minus → negative, "SEE STMT" → numericValue null, "X" → checkbox true, dollar sign strip, preserve decimal percentages without rounding in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
**Checkpoint**: Foundation ready — pdfjs-dist loads PDFs, data-font items are isolated, positions match to regions, values parse correctly
---
## Phase 3: User Story 1 — Accurate K-1 Value Extraction (Priority: P1) 🎯 MVP
**Goal**: Extract all Part III box values (boxes 1-21) with correct box numbers, values, signs, and subtype codes
**Independent Test**: Upload a sample K-1 PDF and verify Part III boxes are correctly extracted — box 1 = 498,211; box 11 ZZ* = (409,615); box 19 A = 4,493,757; box 20 with 4 subtypes; box 21 * = 196
### Implementation for User Story 1
- [x] T007 [US1] Implement Part III extraction loop: iterate data-font items, match to Part III regions (left column boxes 1-13, right column boxes 14-21), build K1ExtractedField with boxNumber, rawValue, numericValue, fieldCategory='PART_III' in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T008 [US1] Implement subtype code pairing: for regions with hasSubtype=true, find code text item and value text item at same y-band (±8pts) using subtypeXMin/XMax ranges from k1-position-regions.ts, set subtype field on K1ExtractedField in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T009 [US1] Handle multi-subtype boxes (box 20 with A, B, Z, * at ~23pt vertical spacing): produce separate K1ExtractedField entry for each subtype/value pair within the box's y-range in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T010 [US1] Wire Part III extraction into the main extract() method: call extraction after font filtering and position matching, merge Part III fields into K1ExtractionResult.fields array in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
**Checkpoint**: Part III boxes 1-21 fully extracted with subtypes — User Story 1 independently testable via upload
---
## Phase 4: User Story 2 — Partnership & Partner Metadata Extraction (Priority: P1)
**Goal**: Extract Part I (partnership info) and Part II (partner info) metadata — names, EINs, addresses, tax year, filing status
**Independent Test**: Upload a K-1 PDF and verify partnership name, EIN, partner name, tax year, and final/amended status are correctly populated on K1ExtractionResult.metadata
### Implementation for User Story 2
- [x] T011 [US2] Implement header region extraction: match data items to Header regions for tax year (combine "20" + "25"), tax year begin/end dates, Final K-1 flag, Amended K-1 flag in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T012 [US2] Implement Part I extraction: match data items to Part I regions for partnership EIN (field A), partnership name and address (field B), and IRS Center (field C) in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T013 [US2] Implement Part II extraction: match data items to Part II regions for partner EIN (field D), partner name (field E), address (field F), and partner type general/limited (field G) and domestic/foreign (field H) in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T014 [US2] Assemble K1ExtractionResult.metadata object from extracted header, Part I, and Part II fields, setting partnershipName, partnershipEin, partnerName, partnerEin, taxYear, isFinal, isAmended in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
**Checkpoint**: Metadata fully populated — User Story 2 independently testable via upload
---
## Phase 5: User Story 3 — Part I/II Financial Fields Extraction (Priority: P2)
**Goal**: Extract Sections J (percentages), K (liabilities), L (capital account), M (contributed property), N (net 704(c) gain/loss)
**Independent Test**: Upload a K-1 PDF and verify Section J percentages (3.032900 / 0.000000), Section K nonrecourse (498,211), Section L capital values with correct signs, Section N values are extracted
### Implementation for User Story 3
- [x] T015 [US3] Implement Section J extraction: match data items to 7 Section J regions for profit/loss/capital beginning and ending percentages, plus decrease-in-sale field, with fieldCategory='SECTION_J' in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T016 [US3] Implement Section K extraction: match data items to 8 Section K regions for nonrecourse/qualified nonrecourse/recourse beginning and ending liabilities, plus K-2/K-3 checkbox regions, with fieldCategory='SECTION_K' in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T017 [US3] Implement Section L extraction: match data items to 6 Section L regions for beginning capital, capital contributed, current year net income/loss, other increase/decrease, withdrawals/distributions, ending capital with fieldCategory='SECTION_L' in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T018 [US3] Implement Section M (contributed property yes/no checkbox) and Section N (beginning and ending net 704(c) gain/loss values) extraction with fieldCategory='SECTION_M' and 'SECTION_N' respectively in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
**Checkpoint**: All J/K/L/M/N financial fields extracted — User Story 3 independently testable
---
## Phase 6: User Story 4 — Checkbox and Boolean Field Extraction (Priority: P2)
**Goal**: Detect all checkbox fields (Final K-1, Amended K-1, General/Limited, Domestic/Foreign, K-2/K-3 attached) as boolean values
**Independent Test**: Upload a K-1 PDF with known checkbox states and verify Final K-1 = true, Limited partner = true, Domestic = true, box 16 K-3 attached = true
### Implementation for User Story 4
- [x] T019 [US4] Implement checkbox detection: for all regions with valueType='checkbox', check if an "X" text item exists at the checkbox position, build K1ExtractedField with rawValue="X", numericValue=null, isCheckbox=true, fieldCategory='CHECKBOX' for checked boxes in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T020 [US4] Ensure unchecked checkboxes are either omitted or included with rawValue="" and isCheckbox=true to distinguish from missing data, and verify checkbox fields set on K1ExtractionResult.metadata (isFinal, isAmended) in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
**Checkpoint**: All checkbox fields correctly detected as boolean values — User Story 4 independently testable
---
## Phase 7: User Story 5 — Manual Mapping Fallback for Ambiguous Fields (Priority: P3)
**Goal**: Data-font values that don't match any region appear in unmappedItems with position info for manual assignment
**Independent Test**: Upload a K-1 PDF where some values fall outside expected regions and verify those values appear in unmappedItems with raw text, x, y, fontName, pageNumber
### Implementation for User Story 5
- [x] T021 [US5] After all region matching is complete, collect remaining unmatched data-font items into K1UnmappedItem[] with rawLabel='', rawValue, numericValue (parsed), confidence=0.5, pageNumber=1, x, y, fontName in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T022 [US5] Verify unmapped items integrate with existing review UI manual assignment flow: ensure assignedBoxNumber and resolution fields on K1UnmappedItem work with the confirmation endpoint in apps/api/src/app/k1-import/k1-import.service.ts
**Checkpoint**: Zero data loss — all extracted values either mapped to fields or available in unmappedItems for manual assignment
---
## Phase 8: Polish & Cross-Cutting Concerns
**Purpose**: Error handling, confidence scoring, cleanup, and service integration
- [x] T023 Implement graceful error handling: wrap extraction in try/catch, return empty fields + low confidence + meaningful error for non-K-1 and corrupted PDFs, never crash on unexpected content in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T024 Implement confidence scoring: HIGH (≥0.90) when value center is within region center ±5pts, MEDIUM (0.70-0.89) within ±10pts, LOW (0.50-0.69) at tolerance boundary ±15pts; compute overallConfidence as weighted average in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T025 Ensure pdfDoc.destroy() cleanup runs in all code paths (success, error, empty result) using try/finally in apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
- [x] T026 [P] Update k1-import.service.ts to handle new subtype field when building K1Cell records — concatenate subtype into boxNumber (e.g., "11-ZZ*", "20-A") or store via metadata JSON column in apps/api/src/app/k1-import/k1-import.service.ts
- [x] T027 Run quickstart.md verification checklist: upload test K-1 PDF, verify all 9 checklist items pass (box 11/19/20/21, Section J/L, Final K-1 checkbox, unmapped empty, non-K-1 error)
---
## Dependencies & Execution Order
### Phase Dependencies
- **Setup (Phase 1)**: No dependencies — can start immediately
- **Foundational (Phase 2)**: T002 can run in parallel with T001 (different files). T003-T006 depend on T001 (interface types) and execute sequentially in pdf-parse-extractor.ts
- **User Stories (Phase 3-7)**: ALL depend on completion of the Setup and Foundational phases (T001-T006)
- US1 (Phase 3) and US2 (Phase 4): Both P1, execute sequentially (same file)
- US3 (Phase 5) and US4 (Phase 6): Both P2, execute after US1+US2 (same file)
- US5 (Phase 7): P3, executes last of user stories
- **Polish (Phase 8)**: T023-T025 depend on all user stories. T026 is independent (different file, marked [P])
### User Story Dependencies
- **US1 (P1)**: Depends only on Foundational. No dependency on other stories.
- **US2 (P1)**: Depends only on Foundational. No dependency on US1 (metadata vs Part III are separate regions).
- **US3 (P2)**: Depends only on Foundational. J/K/L/M/N regions are independent of Part III.
- **US4 (P2)**: Depends only on Foundational. Checkbox detection is position-based, independent of value extraction. Some overlap with US2 (Final/Amended checkboxes set metadata flags).
- **US5 (P3)**: Depends on US1-US4 being done (unmapped = whatever's left after all matching).
### Within Each User Story
- Region matching before subtype pairing
- Subtype pairing before multi-subtype handling
- Core extraction before wiring into extract()
- Story complete before moving to next priority
### Parallel Opportunities
- **T001 + T002**: Interface expansion and position regions file — different files, no dependencies
- **T026**: Service update — different file from extractor, can run in parallel with T023-T025
- **US1-US4**: While all modify the same extractor file (sequential), each story's extraction logic is a self-contained function that could theoretically be developed in parallel branches
---
## Parallel Example: Foundational Phase
```
# These two tasks can run simultaneously:
Task T001: "Expand interfaces in k1-import.interface.ts"
Task T002: "Create position regions in k1-position-regions.ts"
# Then sequentially in pdf-parse-extractor.ts:
Task T003: "Scaffold pdfjs-dist infrastructure"
Task T004: "Font discrimination logic"
Task T005: "Position matching engine"
Task T006: "Value parsing utility"
```
## Parallel Example: Polish Phase
```
# These can run simultaneously (different files):
Task T023-T025: "Error handling, confidence, cleanup in pdf-parse-extractor.ts"
Task T026: "Service subtype handling in k1-import.service.ts"
# Final validation after all above:
Task T027: "Run quickstart.md verification checklist"
```
---
## Implementation Strategy
### MVP First (User Story 1 Only)
1. Complete Phase 1: Setup (T001) — interface expansion
2. Complete Phase 2: Foundational (T002-T006) — pdfjs-dist + regions + font + parsing
3. Complete Phase 3: User Story 1 (T007-T010) — Part III boxes 1-21
4. **STOP and VALIDATE**: Upload test K-1 PDF, verify Part III extraction
5. This delivers the core value — accurate box values replace broken regex parser
### Incremental Delivery
1. Setup + Foundational → Infrastructure ready
2. Add US1 (Part III) → Test independently → **MVP!**
3. Add US2 (Metadata) → Test independently → Metadata populated
4. Add US3 (J/K/L/M/N) → Test independently → Financial fields complete
5. Add US4 (Checkboxes) → Test independently → Boolean fields detected
6. Add US5 (Unmapped) → Test independently → Zero data loss guaranteed
7. Polish → Error handling, confidence, service integration
### Single Developer Flow
All user story tasks modify the same extractor file, so execute sequentially:
Phase 1 → Phase 2 → Phase 3 (US1) → Phase 4 (US2) → Phase 5 (US3) → Phase 6 (US4) → Phase 7 (US5) → Phase 8 (Polish)
---
## Notes
- All 73 position regions are defined in T002 upfront — individual story phases use them
- No new npm dependencies required (pdfjs-dist already installed via pdf-parse)
- The extractor rewrite preserves the existing K1Extractor interface contract (extract + isAvailable)
- Keep isDigitalK1() from the existing extractor — it's used by isAvailable()
- Font names are dynamic — never hardcode specific font names like "g_d0_f8"
- Total: 27 tasks across 8 phases covering 5 user stories

43
test/import/ok/sample-k1-digital.txt

@ -0,0 +1,43 @@
K-1 Test Fixture: Digital PDF
================================
This file documents the expected test data for a digital (text-based) K-1 PDF.
Replace this file with an actual PDF for integration testing.
Expected Extraction Method: pdf-parse (Tier 1)
Expected Confidence: HIGH (>= 0.85) for all fields
--- Form Header ---
Schedule K-1 (Form 1065)
Partner's Share of Income, Deductions, Credits, etc.
Tax Year: 2024
Partnership EIN: 12-3456789
Partnership Name: Test Investment Partners, LP
Partner Name: Test Entity LLC
Partner EIN: 98-7654321
--- Part III: Partner's Share ---
Box 1 - Ordinary business income (loss): 125,000
Box 2 - Net rental real estate income (loss): -15,000
Box 3 - Other net rental income (loss): 0
Box 4 - Guaranteed payments for services: 50,000
Box 5 - Interest income: 8,500
Box 6a - Ordinary dividends: 12,000
Box 6b - Qualified dividends: 9,500
Box 7 - Royalties: 0
Box 8 - Net short-term capital gain (loss): 3,200
Box 9a - Net long-term capital gain (loss): 45,000
Box 9b - Collectibles (28%) gain (loss): 0
Box 9c - Unrecaptured section 1250 gain: 2,100
Box 10 - Net section 1231 gain (loss): 0
Box 11 - Other income (loss): 1,500
Box 12 - Section 179 deduction: 0
Box 13 - Other deductions: -4,200
Box 14 - Self-employment earnings (loss): 50,000
Box 15 - Credits: 0
Box 16 - Foreign transactions: 0
Box 17 - Alternative minimum tax (AMT) items: 0
Box 18 - Tax-exempt income and nondeductible expenses: 0
Box 19a - Distributions (cash): 75,000
Box 19b - Distributions (property): 0
Box 20 - Other information: 0
Box 21 - Foreign taxes paid or accrued: 0

50
test/import/ok/sample-k1-scanned.txt

@ -0,0 +1,50 @@
K-1 Test Fixture: Scanned PDF
================================
This file documents the expected test data for a scanned (image-based) K-1 PDF.
Replace this file with an actual scanned PDF for integration testing.
Expected Extraction Method: azure (Tier 2) or tesseract (Tier 2 fallback)
Expected Confidence: MEDIUM (0.60-0.84) for most fields due to OCR uncertainty
--- Form Header ---
Schedule K-1 (Form 1065)
Partner's Share of Income, Deductions, Credits, etc.
Tax Year: 2023
Partnership EIN: 55-1234567
Partnership Name: Scanned Capital Fund, LP
Partner Name: Member Entity Inc.
Partner EIN: 77-9876543
--- Part III: Partner's Share ---
Box 1 - Ordinary business income (loss): -32,500
Box 2 - Net rental real estate income (loss): 0
Box 3 - Other net rental income (loss): 0
Box 4 - Guaranteed payments for services: 0
Box 5 - Interest income: 2,100
Box 6a - Ordinary dividends: 5,800
Box 6b - Qualified dividends: 4,200
Box 7 - Royalties: 0
Box 8 - Net short-term capital gain (loss): -1,500
Box 9a - Net long-term capital gain (loss): 18,750
Box 9b - Collectibles (28%) gain (loss): 0
Box 9c - Unrecaptured section 1250 gain: 0
Box 10 - Net section 1231 gain (loss): 0
Box 11 - Other income (loss): 0
Box 12 - Section 179 deduction: 0
Box 13 - Other deductions: -2,800
Box 14 - Self-employment earnings (loss): 0
Box 15 - Credits: 0
Box 16 - Foreign transactions: 0
Box 17 - Alternative minimum tax (AMT) items: 0
Box 18 - Tax-exempt income and nondeductible expenses: 750
Box 19a - Distributions (cash): 25,000
Box 19b - Distributions (property): 0
Box 20 - Other information: 0
Box 21 - Foreign taxes paid or accrued: 350
--- OCR Simulation Notes ---
This fixture simulates a scanned PDF where:
- Some numeric values may have OCR artifacts (e.g., "l" vs "1", "O" vs "0")
- Confidence scores should reflect Tier 2 extraction uncertainty
- The Azure DI or tesseract extractors handle these ambiguities
- Expected to generate MEDIUM confidence for most fields

74
tools/extract-k1-positions.mjs

@ -0,0 +1,74 @@
/**
* Utility to extract all text items with their (x, y) positions from a K-1 PDF.
* This dumps every text item with coordinates so we can calibrate position regions.
*
* Usage: node tools/extract-k1-positions.mjs <path-to-pdf>
*/
import { readFileSync } from 'fs';
import { resolve } from 'path';
// Dynamic import of pdfjs-dist legacy build
// Validate the CLI argument up front so a usage error is reported before the
// comparatively slow pdfjs-dist import runs.
const pdfPath = process.argv[2];
if (!pdfPath) {
  console.error('Usage: node tools/extract-k1-positions.mjs <path-to-pdf>');
  process.exit(1);
}
// Dynamic import of pdfjs-dist legacy build
const { getDocument, GlobalWorkerOptions } = await import(
  'pdfjs-dist/legacy/build/pdf.mjs'
);
// pathToFileURL produces a well-formed, percent-encoded file:// URL on every
// platform (string concatenation yields "file:////..." on POSIX and breaks on
// paths containing characters such as '#', '?' or '%').
const workerPath = (await import('node:url')).pathToFileURL(
  resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')
).href;
GlobalWorkerOptions.workerSrc = workerPath;
const buffer = readFileSync(pdfPath);
const loadingTask = getDocument({
  data: new Uint8Array(buffer),
  standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
  cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
  cMapPacked: true,
  isEvalSupported: false,
  disableFontFace: true
});
const pdfDoc = await loadingTask.promise;
try {
  console.log(`Pages: ${pdfDoc.numPages}`);
  // Only the first two pages are dumped.
  for (let pageNum = 1; pageNum <= Math.min(pdfDoc.numPages, 2); pageNum++) {
    console.log(`\n=== PAGE ${pageNum} ===\n`);
    const page = await pdfDoc.getPage(pageNum);
    const textContent = await page.getTextContent({ includeMarkedContent: false });
    const items = textContent.items;
    const styles = textContent.styles;
    // Order top of page first, then left to right within a row. Items are
    // sorted strictly by y and then clustered into rows (within 2 units of the
    // row's first item). A single comparator with a "close enough" y test is
    // not transitive and therefore not a valid sort order.
    const ROW_TOLERANCE = 2;
    const topDown = [...items].sort((a, b) => b.transform[5] - a.transform[5]);
    const rows = [];
    for (const item of topDown) {
      const currentRow = rows[rows.length - 1];
      if (
        currentRow &&
        Math.abs(currentRow[0].transform[5] - item.transform[5]) <= ROW_TOLERANCE
      ) {
        currentRow.push(item);
      } else {
        rows.push([item]);
      }
    }
    const sorted = rows.flatMap((row) =>
      row.sort((a, b) => a.transform[4] - b.transform[4])
    );
    for (const item of sorted) {
      const text = item.str.trim();
      if (!text) continue;
      // transform[4] / transform[5] are the item's x / y, rounded to 0.1
      const x = Math.round(item.transform[4] * 10) / 10;
      const y = Math.round(item.transform[5] * 10) / 10;
      const style = styles[item.fontName] || {};
      const fontFamily = style.fontFamily || 'unknown';
      // Anything that is not the serif (template) font is reported as data
      const isData = fontFamily.toLowerCase() !== 'serif';
      console.log(
        `${isData ? 'DATA' : 'TMPL'} | x=${String(x).padStart(7)} | y=${String(y).padStart(7)} | font=${fontFamily.padEnd(15)} | "${text}"`
      );
    }
  }
} finally {
  // Release the document even when a page fails to parse
  await pdfDoc.destroy();
}
console.log('\nDone.');

427
tools/test-k1-parse.mjs

@ -0,0 +1,427 @@
/**
* Test script: runs the PdfParseExtractor logic directly on a K-1 PDF
* and prints all extracted fields, metadata, and unmapped items.
*
* Usage: node tools/test-k1-parse.mjs <path-to-pdf>
*/
import { readFileSync } from 'fs';
import { resolve } from 'path';
// ── pdfjs-dist setup ──
const { getDocument, GlobalWorkerOptions } = await import(
  'pdfjs-dist/legacy/build/pdf.mjs'
);
// Build the worker URL with pathToFileURL rather than string concatenation:
// it yields a correct "file:///..." URL on both Windows and POSIX and
// percent-encodes characters that are special in URLs.
const workerPath = (await import('node:url')).pathToFileURL(
  resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')
).href;
GlobalWorkerOptions.workerSrc = workerPath;
// ── Load k1-position-regions ──
// The region definitions live in a TypeScript file, so they cannot be imported
// directly from this .mjs script. They are compiled on the fly with esbuild
// (see below) and loaded from the temporary bundle.
// The extraction logic itself is replicated inline in this script.
// Slack added on every side of a region's bounding box when testing whether
// an item falls inside it.
const POSITION_TOLERANCE = 15;
// Maximum vertical distance between a subtype code and the value paired with it.
const SUBTYPE_Y_TOLERANCE = 8;
// ── Import the regions by compiling the TS on the fly ──
// esbuild bundles the TypeScript regions file into a temporary ES module,
// which is imported and then deleted again.
import { execSync } from 'child_process';
import { writeFileSync, unlinkSync, existsSync } from 'fs';
// Build a temp bundle of just the regions file
const regionsTsPath = resolve(
  'apps/api/src/app/k1-import/extractors/k1-position-regions.ts'
);
const regionsTmpPath = resolve('tools/_tmp_regions.mjs');
try {
  execSync(
    `npx esbuild "${regionsTsPath}" --bundle --format=esm --outfile="${regionsTmpPath}" --platform=node`,
    { stdio: 'pipe' }
  );
} catch (e) {
  // stderr is absent when the command could not be spawned at all; fall back
  // to the error message so the failure is never reported as "undefined".
  console.error('Failed to compile regions file:', e.stderr?.toString() || e.message);
  process.exit(1);
}
let regionsModule;
try {
  // pathToFileURL gives a valid, percent-encoded file:// URL on all platforms
  regionsModule = await import(
    (await import('node:url')).pathToFileURL(regionsTmpPath).href
  );
} finally {
  // Clean up (best effort) — also when the import itself throws
  try { unlinkSync(regionsTmpPath); } catch {}
}
const K1_POSITION_REGIONS = regionsModule.K1_POSITION_REGIONS;
if (!Array.isArray(K1_POSITION_REGIONS)) {
  // Fail with a clear message instead of a later "filter is not a function"
  console.error('Regions file does not export a K1_POSITION_REGIONS array');
  process.exit(1);
}
// ── PDF parsing ──
// The PDF to analyse is the first CLI argument; print usage and exit without it.
const [, , pdfPath] = process.argv;
if (!pdfPath) {
  console.error('Usage: node tools/test-k1-parse.mjs <path-to-pdf>');
  process.exit(1);
}
const buffer = readFileSync(pdfPath);
// Fonts and cmaps are served from the local pdfjs-dist package.
const loadingTask = getDocument({
  disableFontFace: true,
  isEvalSupported: false,
  cMapPacked: true,
  cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
  standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
  data: new Uint8Array(buffer)
});
const pdfDoc = await loadingTask.promise;
// Only page 1 is inspected.
const page = await pdfDoc.getPage(1);
const textContent = await page.getTextContent({ includeMarkedContent: false });
const { items, styles } = textContent;
// Filter data items (non-serif)
// Each kept entry records the trimmed text, its x/y position (transform[4] /
// transform[5]), its font, and a `matched` flag that the extraction passes
// below set once the item has been consumed.
const dataItems = [];
for (const item of items) {
  const text = item.str.trim();
  if (!text) continue;
  const style = styles[item.fontName];
  if (!style) continue;
  // A style entry without a fontFamily must not crash the script; treat it as
  // 'unknown' (i.e. data), matching tools/extract-k1-positions.mjs.
  const fontFamily = (style.fontFamily || 'unknown').toLowerCase();
  if (fontFamily === 'serif') continue;
  dataItems.push({
    text,
    x: item.transform[4],
    y: item.transform[5],
    fontName: item.fontName,
    fontFamily,
    matched: false
  });
}
console.log(`Total data items: ${dataItems.length}\n`);
// ── Parsing logic (mirrors PdfParseExtractor) ──
/**
 * Converts the raw text of a K-1 cell into a number.
 *
 * Handles thousands separators, a dollar sign, a leading minus and
 * parenthesized negatives ("(409,615)" -> -409615). A trailing percent sign is
 * tolerated. Marker strings (SEE STMT, X, YES, ...) and anything that is not a
 * plain decimal number yield null.
 *
 * Note: this is deliberately stricter than a bare parseFloat, which accepts
 * any numeric prefix and would turn an EIN such as "12-3456789" into 12 or a
 * date such as "01/01/2025" into 1.
 *
 * @param raw cell text (may be null/undefined/empty)
 * @returns the parsed number, or null when the text is not numeric
 */
function parseNumericValue(raw) {
  if (typeof raw !== 'string') return null;
  const trimmed = raw.trim();
  if (!trimmed) return null;
  const upper = trimmed.toUpperCase();
  if (['SEE STMT', 'STMT', 'SEE STATEMENT', 'X', 'E-FILE', 'YES', 'NO'].includes(upper))
    return null;
  // Accounting notation: a fully parenthesized value is negative
  const isParenNeg = /^\(.*\)$/.test(trimmed);
  let cleaned = trimmed.replace(/[$,()]/g, '').trim();
  const isMinusNeg = cleaned.startsWith('-');
  if (isMinusNeg) cleaned = cleaned.substring(1).trim();
  // Require the whole remainder to be a decimal number (optional '+', optional '%')
  if (!/^\+?(?:\d+\.?\d*|\.\d+)%?$/.test(cleaned)) return null;
  const num = parseFloat(cleaned);
  if (isNaN(num)) return null;
  return isParenNeg || isMinusNeg ? -num : num;
}
/**
 * Picks the unmatched item that lies inside `region` (widened by
 * POSITION_TOLERANCE on every side) and is closest to the region's center.
 *
 * @param items   candidate items with `x`, `y` and `matched`
 * @param region  bounding box with `xMin`, `xMax`, `yMin`, `yMax`
 * @returns the closest item, or null when no unmatched item is in range
 */
function findBestItemInRegion(items, region) {
  const centerX = (region.xMin + region.xMax) / 2;
  const centerY = (region.yMin + region.yMax) / 2;
  const left = region.xMin - POSITION_TOLERANCE;
  const right = region.xMax + POSITION_TOLERANCE;
  const bottom = region.yMin - POSITION_TOLERANCE;
  const top = region.yMax + POSITION_TOLERANCE;
  let winner = null;
  let winnerDistance = Infinity;
  for (const candidate of items) {
    if (candidate.matched) continue;
    const insideX = candidate.x >= left && candidate.x <= right;
    const insideY = candidate.y >= bottom && candidate.y <= top;
    if (!(insideX && insideY)) continue;
    const offX = candidate.x - centerX;
    const offY = candidate.y - centerY;
    const distance = Math.sqrt(offX * offX + offY * offY);
    // Strictly-less keeps the first of equally distant candidates
    if (distance < winnerDistance) {
      winnerDistance = distance;
      winner = candidate;
    }
  }
  return winner;
}
// Accumulator for every extracted field record; the passes below push into it.
const fields = [];
// Document-level metadata. Values stay null/false until an extraction pass
// below finds them.
const metadata = {
partnershipName: null,
partnershipEin: null,
partnerName: null,
partnerEin: null,
taxYear: null,
isAmended: false,
isFinal: false
};
/**
 * Closest-center assignment helper.
 *
 * Collects every (unmatched item, region) pair where the item lies inside the
 * region widened by POSITION_TOLERANCE, then hands pairs out greedily in order
 * of increasing distance to the region center. Each item and each region is
 * used at most once.
 *
 * @param items    candidate items with `x`, `y` and `matched`
 * @param regions  bounding boxes with `xMin`, `xMax`, `yMin`, `yMax`
 * @returns Map from region to the item assigned to it
 */
function assignItemsToRegions(items, regions) {
  const covers = (region, item) =>
    item.x >= region.xMin - POSITION_TOLERANCE &&
    item.x <= region.xMax + POSITION_TOLERANCE &&
    item.y >= region.yMin - POSITION_TOLERANCE &&
    item.y <= region.yMax + POSITION_TOLERANCE;
  const pairs = [];
  for (const item of items) {
    if (item.matched) continue;
    for (const region of regions) {
      if (!covers(region, item)) continue;
      const offX = item.x - (region.xMin + region.xMax) / 2;
      const offY = item.y - (region.yMin + region.yMax) / 2;
      pairs.push({ item, region, distance: Math.sqrt(offX * offX + offY * offY) });
    }
  }
  // Nearest pairs get first pick
  pairs.sort((p, q) => p.distance - q.distance);
  const regionToItem = new Map();
  const takenItems = new Set();
  for (const pair of pairs) {
    const stillFree = !takenItems.has(pair.item) && !regionToItem.has(pair.region);
    if (stillFree) {
      regionToItem.set(pair.region, pair.item);
      takenItems.add(pair.item);
    }
  }
  return regionToItem;
}
// 1. Checkboxes (closest-center assignment)
const checkboxRegions = K1_POSITION_REGIONS.filter(r => r.valueType === 'checkbox');
// Only check-mark glyphs may compete for checkbox regions. The assignment is
// one item per region, so letting every data item take part allows a nearby
// number or label to claim a checkbox region and hide a real "X" sitting
// slightly further from its center — the box would then be reported unchecked.
const CHECK_MARKS = ['X', '✓', '✗'];
const checkMarkItems = dataItems.filter(i => CHECK_MARKS.includes(i.text.toUpperCase()));
const cbAssignments = assignItemsToRegions(checkMarkItems, checkboxRegions);
const checkedRegionIds = new Set();
for (const [region, item] of cbAssignments) {
  checkedRegionIds.add(region.fieldId);
  fields.push({
    fieldId: region.fieldId,
    boxNumber: region.boxNumber,
    label: region.label,
    rawValue: 'true',
    numericValue: null,
    fieldCategory: 'CHECKBOX',
    isCheckbox: true,
    subtype: null
  });
  item.matched = true;
  // Final / Amended checkboxes are mirrored into the metadata flags
  if (region.fieldId === 'FINAL_K1') metadata.isFinal = true;
  if (region.fieldId === 'AMENDED_K1') metadata.isAmended = true;
}
// Emit false for unchecked checkbox regions
for (const region of checkboxRegions) {
  if (checkedRegionIds.has(region.fieldId)) continue;
  fields.push({
    fieldId: region.fieldId,
    boxNumber: region.boxNumber,
    label: region.label,
    rawValue: 'false',
    numericValue: null,
    fieldCategory: 'CHECKBOX',
    isCheckbox: true,
    subtype: null
  });
}
// 2. Part III — subtype regions first, then simple
// All non-checkbox regions whose category is PART_III ...
const partIIIRegions = K1_POSITION_REGIONS.filter(
r => r.fieldCategory === 'PART_III' && r.valueType !== 'checkbox'
);
// ... split by whether the region is flagged hasSubtype.
const subtypeRegions = partIIIRegions.filter(r => r.hasSubtype);
const simpleRegions = partIIIRegions.filter(r => !r.hasSubtype);
/**
 * Extracts the entries of a Part III region that carries subtype codes.
 *
 * Unmatched data items inside the region's y-range (± POSITION_TOLERANCE) are
 * split into subtype codes (x within subtypeXMin..subtypeXMax) and values
 * (x within xMin..xMax). Every code produces one field, paired with the first
 * still-unmatched value whose y is within SUBTYPE_Y_TOLERANCE of the code; a
 * code without a value gets an empty rawValue. If the region has no codes at
 * all, the first value (if any) is emitted with subtype null.
 *
 * Appends to `fields` and sets `matched` on every consumed item.
 *
 * @param region Part III region definition
 */
function extractSubtypeField(region) {
  const within = (value, lo, hi) =>
    value >= lo - POSITION_TOLERANCE && value <= hi + POSITION_TOLERANCE;
  const hasCodeColumn = region.subtypeXMin !== null && region.subtypeXMax !== null;
  const codeItems = [];
  const valueItems = [];
  for (const candidate of dataItems) {
    if (candidate.matched) continue;
    if (!within(candidate.y, region.yMin, region.yMax)) continue;
    if (hasCodeColumn && within(candidate.x, region.subtypeXMin, region.subtypeXMax)) {
      codeItems.push(candidate);
    } else if (within(candidate.x, region.xMin, region.xMax)) {
      valueItems.push(candidate);
    }
  }
  // Shared record builder so both branches emit the same shape
  const emit = (rawValue, subtype) => {
    fields.push({
      fieldId: region.fieldId,
      boxNumber: region.boxNumber,
      label: region.label,
      rawValue,
      numericValue: parseNumericValue(rawValue),
      fieldCategory: region.fieldCategory,
      isCheckbox: false,
      subtype
    });
  };
  if (codeItems.length > 0) {
    for (const codeItem of codeItems) {
      const partner = valueItems.find(
        v => !v.matched && Math.abs(v.y - codeItem.y) <= SUBTYPE_Y_TOLERANCE
      );
      emit(partner ? partner.text : '', codeItem.text.trim());
      codeItem.matched = true;
      if (partner) partner.matched = true;
    }
    return;
  }
  if (valueItems.length === 0) return;
  // No codes found: fall back to the first value in the region
  const [firstValue] = valueItems;
  emit(firstValue.text, null);
  firstValue.matched = true;
}
// Subtype regions go first so their code/value pairs are consumed before the
// simple regions pick from what is left.
subtypeRegions.forEach(region => {
  extractSubtypeField(region);
});
// Simple regions: one field per region, taken from the closest unmatched item.
simpleRegions.forEach(region => {
  const best = findBestItemInRegion(dataItems, region);
  if (best) {
    const { fieldId, boxNumber, label, fieldCategory } = region;
    fields.push({
      fieldId,
      boxNumber,
      label,
      rawValue: best.text,
      numericValue: parseNumericValue(best.text),
      fieldCategory,
      isCheckbox: false,
      subtype: null
    });
    best.matched = true;
  }
});
// 3. Metadata — tax year (lowered threshold from 745 to 710)
// Candidates are unmatched 2–4 digit items in the header band
// (y > 710, 200 < x < 350).
const taxYearItems = [];
for (const item of dataItems) {
  if (item.matched) continue;
  if (item.y > 710 && item.x > 200 && item.x < 350) {
    if (/^\d{2,4}$/.test(item.text)) {
      taxYearItems.push(item);
    }
  }
}
{
  const isPlausibleYear = (n) => n >= 1900 && n <= 2100;
  // Some generators render the year as one item ("2025") instead of two
  // ("20" + "25"). Accept a stand-alone 4-digit year first; otherwise fall
  // back to joining the fragments left to right.
  const wholeYearItem = taxYearItems.find(
    i => i.text.length === 4 && isPlausibleYear(parseInt(i.text, 10))
  );
  if (wholeYearItem) {
    metadata.taxYear = parseInt(wholeYearItem.text, 10);
    wholeYearItem.matched = true;
  } else if (taxYearItems.length >= 2) {
    taxYearItems.sort((a, b) => a.x - b.x);
    const combined = taxYearItems.map(i => i.text).join('');
    const year = parseInt(combined, 10);
    if (isPlausibleYear(year)) {
      metadata.taxYear = year;
      for (const item of taxYearItems) item.matched = true;
    }
  }
}
/**
 * Text metadata: gathers every unmatched data item inside the named region
 * (widened by POSITION_TOLERANCE), joins their text from highest y to lowest
 * with single spaces, and stores the result under `metadata[metadataKey]`.
 * All gathered items are marked matched, even when `metadataKey` is null.
 *
 * @param regionFieldId fieldId of the region to read
 * @param metadataKey   metadata property to fill, or null to only consume the items
 */
function extractTextMetadata(regionFieldId, metadataKey) {
  const region = K1_POSITION_REGIONS.find(r => r.fieldId === regionFieldId);
  if (!region) return;
  const left = region.xMin - POSITION_TOLERANCE;
  const right = region.xMax + POSITION_TOLERANCE;
  const bottom = region.yMin - POSITION_TOLERANCE;
  const top = region.yMax + POSITION_TOLERANCE;
  const hits = dataItems.filter(
    c => !c.matched && c.x >= left && c.x <= right && c.y >= bottom && c.y <= top
  );
  if (hits.length === 0) return;
  // Larger y first
  hits.sort((a, b) => b.y - a.y);
  const joined = hits.map(h => h.text).join(' ').trim();
  if (metadataKey && joined) {
    metadata[metadataKey] = joined;
  }
  hits.forEach(h => {
    h.matched = true;
  });
}
// Named text regions, in processing order. A null key consumes the region's
// items without storing them in metadata.
for (const [regionFieldId, metadataKey] of [
  ['A_EIN', 'partnershipEin'],
  ['B_NAME', 'partnershipName'],
  ['C_IRS_CENTER', null],
  ['E_TIN', 'partnerEin'],
  ['F_NAME_ADDR', 'partnerName']
]) {
  extractTextMetadata(regionFieldId, metadataKey);
}
// Remaining metadata regions
const metadataRegions = K1_POSITION_REGIONS.filter(
  r => r.fieldCategory === 'METADATA' && r.valueType === 'text'
);
metadataRegions.forEach(region => {
  const best = findBestItemInRegion(dataItems, region);
  if (best) {
    const { fieldId, boxNumber, label, fieldCategory } = region;
    fields.push({
      fieldId,
      boxNumber,
      label,
      rawValue: best.text,
      numericValue: parseNumericValue(best.text),
      fieldCategory,
      isCheckbox: false,
      subtype: null
    });
    best.matched = true;
  }
});
// 4. Sections J/K/L/M/N (closest-center assignment)
// For each section, its non-checkbox regions are passed to
// assignItemsToRegions, and every (region, item) pair it yields becomes a field.
const sectionCategories = ['SECTION_J', 'SECTION_K', 'SECTION_L', 'SECTION_M', 'SECTION_N'];
for (const sectionCategory of sectionCategories) {
  const sectionRegions = K1_POSITION_REGIONS.filter(
    (candidate) =>
      candidate.fieldCategory === sectionCategory && candidate.valueType !== 'checkbox'
  );
  const pairs = assignItemsToRegions(dataItems, sectionRegions);

  for (const [assignedRegion, assignedItem] of pairs) {
    fields.push({
      fieldId: assignedRegion.fieldId,
      boxNumber: assignedRegion.boxNumber,
      label: assignedRegion.label,
      rawValue: assignedItem.text,
      numericValue: parseNumericValue(assignedItem.text),
      fieldCategory: assignedRegion.fieldCategory,
      isCheckbox: false,
      subtype: null
    });
    // Flag the item as consumed.
    assignedItem.matched = true;
  }
}
// ── Print results ──
console.log('=== METADATA ===');
console.log(JSON.stringify(metadata, null, 2));
console.log('\n=== EXTRACTED FIELDS ===');

// Group by category (buckets keep the order in which fields were pushed)
const byCategory = {};
fields.forEach((field) => {
  const key = field.fieldCategory;
  const bucket = byCategory[key] || (byCategory[key] = []);
  bucket.push(field);
});

Object.keys(byCategory).forEach((cat) => {
  console.log(`\n--- ${cat} ---`);
  byCategory[cat].forEach((f) => {
    // Optional suffixes: subtype in brackets, numeric value in parentheses.
    let sub = '';
    if (f.subtype) {
      sub = ` [${f.subtype}]`;
    }
    let num = '';
    if (f.numericValue !== null) {
      num = ` (=${f.numericValue})`;
    }
    console.log(` ${f.fieldId || f.boxNumber}: "${f.rawValue}"${sub}${num}`);
  });
});

// Unmapped: unmatched items that are longer than one character, contain a
// digit, or are exactly "X".
const isWorthReporting = (entry) => {
  if (entry.matched) return false;
  return entry.text.length > 1 || /\d/.test(entry.text) || entry.text === 'X';
};
const roundToTenth = (value) => Math.round(value * 10) / 10;

const unmapped = dataItems.filter(isWorthReporting);
console.log(`\n=== UNMAPPED ITEMS (${unmapped.length}) ===`);
unmapped.forEach((u) => {
  const x = roundToTenth(u.x);
  const y = roundToTenth(u.y);
  console.log(` "${u.text}" at (${x}, ${y}) font=${u.fontFamily}`);
});

await pdfDoc.destroy();
console.log('\nDone.');
Loading…
Cancel
Save