mirror of https://github.com/ghostfolio/ghostfolio
25 changed files with 2035 additions and 295 deletions
@ -0,0 +1,274 @@ |
|||
Pages: 1 |
|||
|
|||
=== PAGE 1 === |
|||
|
|||
DATA | x= 524.2 | y= 758.7 | font=monospace | "651123" |
|||
TMPL | x= 511 | y= 748.4 | font=serif | "OMB No. 1545-0123" |
|||
DATA | x= 324.3 | y= 746.2 | font=sans-serif | "X" |
|||
TMPL | x= 336 | y= 746 | font=serif | "Final K-1" |
|||
TMPL | x= 415.2 | y= 746 | font=serif | "Amended K-1" |
|||
TMPL | x= 36 | y= 735.8 | font=serif | "Schedule K-1" |
|||
TMPL | x= 319.1 | y= 734.9 | font=serif | "Part III" |
|||
TMPL | x= 360 | y= 735.4 | font=serif | "Partner’s Share of Current Year Income," |
|||
DATA | x= 236.8 | y= 727.7 | font=sans-serif | "20" |
|||
DATA | x= 262.1 | y= 727.7 | font=sans-serif | "25" |
|||
TMPL | x= 36 | y= 723.8 | font=serif | "(Form 1065)" |
|||
TMPL | x= 360 | y= 723.4 | font=serif | "Deductions, Credits, and Other Items" |
|||
TMPL | x= 36 | y= 713.5 | font=serif | "Department of the Treasury" |
|||
TMPL | x= 318.5 | y= 712 | font=serif | "1" |
|||
TMPL | x= 334.2 | y= 712 | font=serif | "Ordinary business income (loss)" |
|||
TMPL | x= 453.3 | y= 712 | font=serif | "14" |
|||
TMPL | x= 471 | y= 712 | font=serif | "Self-employment earnings (loss)" |
|||
TMPL | x= 36 | y= 705.5 | font=serif | "Internal Revenue Service" |
|||
TMPL | x= 193.2 | y= 703 | font=serif | "For calendar year 2025, or tax year" |
|||
TMPL | x= 71 | y= 686 | font=serif | "beginning" |
|||
TMPL | x= 129.6 | y= 687 | font=serif | "/" |
|||
DATA | x= 151.2 | y= 686.8 | font=sans-serif | "/" |
|||
DATA | x= 159 | y= 686.8 | font=sans-serif | "2025" |
|||
TMPL | x= 195.6 | y= 686 | font=serif | "ending" |
|||
TMPL | x= 244.8 | y= 687 | font=serif | "/" |
|||
TMPL | x= 266.4 | y= 687 | font=serif | "/" |
|||
TMPL | x= 318.5 | y= 688 | font=serif | "2" |
|||
TMPL | x= 333.2 | y= 688 | font=serif | "Net rental real estate income (loss)" |
|||
TMPL | x= 36 | y= 669.6 | font=serif | "Partner’s Share of Income, Deductions," |
|||
TMPL | x= 318.5 | y= 664 | font=serif | "3" |
|||
TMPL | x= 334.2 | y= 664.1 | font=serif | "Other net rental income (loss)" |
|||
TMPL | x= 453.3 | y= 664 | font=serif | "15" |
|||
TMPL | x= 471 | y= 664 | font=serif | "Credits" |
|||
TMPL | x= 36 | y= 656.6 | font=serif | "Credits, etc." |
|||
TMPL | x= 215.2 | y= 656.8 | font=serif | "See separate instructions." |
|||
TMPL | x= 48.4 | y= 638.9 | font=serif | "Part I" |
|||
TMPL | x= 86.4 | y= 638.9 | font=serif | "Information About the Partnership" |
|||
TMPL | x= 316.4 | y= 640 | font=serif | "4a" |
|||
TMPL | x= 334.2 | y= 640 | font=serif | "Guaranteed payments for services" |
|||
TMPL | x= 40.8 | y= 626 | font=serif | "A" |
|||
TMPL | x= 316.3 | y= 616 | font=serif | "4b" |
|||
TMPL | x= 334.2 | y= 616 | font=serif | "Guaranteed payments for capital" |
|||
TMPL | x= 453.3 | y= 616 | font=serif | "16" |
|||
TMPL | x= 472 | y= 616 | font=serif | "Schedule K-3 is attached if" |
|||
TMPL | x= 472 | y= 606 | font=serif | "checked" |
|||
TMPL | x= 504 | y= 606 | font=serif | "." |
|||
TMPL | x= 516 | y= 606 | font=serif | "." |
|||
TMPL | x= 528 | y= 606 | font=serif | "." |
|||
TMPL | x= 540 | y= 606 | font=serif | "." |
|||
TMPL | x= 552 | y= 606 | font=serif | "." |
|||
TMPL | x= 40.7 | y= 602 | font=serif | "B" |
|||
DATA | x= 563.3 | y= 603.8 | font=sans-serif | "X" |
|||
TMPL | x= 316.4 | y= 592 | font=serif | "4c" |
|||
TMPL | x= 334.2 | y= 592 | font=serif | "Total guaranteed payments" |
|||
TMPL | x= 453.3 | y= 592 | font=serif | "17" |
|||
TMPL | x= 471 | y= 592 | font=serif | "Alternative minimum tax (AMT) items" |
|||
TMPL | x= 318.5 | y= 568 | font=serif | "5" |
|||
TMPL | x= 334.2 | y= 568 | font=serif | "Interest income" |
|||
TMPL | x= 40.6 | y= 554.5 | font=serif | "C" |
|||
TMPL | x= 58.4 | y= 554.5 | font=serif | "IRS center where partnership filed return:" |
|||
DATA | x= 185.4 | y= 553.7 | font=sans-serif | "E-FILE" |
|||
TMPL | x= 40.6 | y= 543 | font=serif | "D" |
|||
TMPL | x= 72 | y= 543 | font=serif | "Check if this is a publicly traded partnership (PTP)" |
|||
TMPL | x= 316.4 | y= 544 | font=serif | "6a" |
|||
TMPL | x= 334.2 | y= 544 | font=serif | "Ordinary dividends" |
|||
TMPL | x= 46.9 | y= 530.9 | font=serif | "Part II" |
|||
TMPL | x= 86.4 | y= 530.9 | font=serif | "Information About the Partner" |
|||
TMPL | x= 40.9 | y= 518 | font=serif | "E" |
|||
TMPL | x= 316.3 | y= 520 | font=serif | "6b" |
|||
TMPL | x= 334.2 | y= 520 | font=serif | "Qualified dividends" |
|||
TMPL | x= 453.3 | y= 520 | font=serif | "18" |
|||
TMPL | x= 471 | y= 520 | font=serif | "Tax-exempt income and" |
|||
TMPL | x= 471 | y= 511.6 | font=serif | "nondeductible expenses" |
|||
TMPL | x= 41.1 | y= 494 | font=serif | "F" |
|||
TMPL | x= 316.4 | y= 496 | font=serif | "6c" |
|||
TMPL | x= 334.2 | y= 496 | font=serif | "Dividend equivalents" |
|||
TMPL | x= 318.5 | y= 472 | font=serif | "7" |
|||
TMPL | x= 334.2 | y= 472 | font=serif | "Royalties" |
|||
TMPL | x= 40.5 | y= 447 | font=serif | "G" |
|||
TMPL | x= 72 | y= 446.6 | font=serif | "General partner or LLC" |
|||
DATA | x= 180.3 | y= 446.6 | font=sans-serif | "X" |
|||
TMPL | x= 194.4 | y= 446.6 | font=serif | "Limited partner or other LLC" |
|||
TMPL | x= 318.5 | y= 448 | font=serif | "8" |
|||
TMPL | x= 334.2 | y= 448 | font=serif | "Net short-term capital gain (loss)" |
|||
TMPL | x= 72 | y= 438.2 | font=serif | "member-manager" |
|||
TMPL | x= 194.4 | y= 438.2 | font=serif | "member" |
|||
TMPL | x= 453.3 | y= 436 | font=serif | "19" |
|||
TMPL | x= 471 | y= 436 | font=serif | "Distributions" |
|||
TMPL | x= 38.7 | y= 423 | font=serif | "H1" |
|||
DATA | x= 58 | y= 422.9 | font=sans-serif | "X" |
|||
TMPL | x= 72 | y= 422 | font=serif | "Domestic partner" |
|||
TMPL | x= 194.4 | y= 422 | font=serif | "Foreign partner" |
|||
TMPL | x= 316.4 | y= 424 | font=serif | "9a" |
|||
TMPL | x= 334.2 | y= 424 | font=serif | "Net long-term capital gain (loss)" |
|||
DATA | x= 455.2 | y= 423.2 | font=sans-serif | "A" |
|||
DATA | x= 530.6 | y= 422 | font=sans-serif | "4,493,757" |
|||
TMPL | x= 38.7 | y= 411 | font=serif | "H2" |
|||
DATA | x= 57.9 | y= 410.5 | font=sans-serif | "X" |
|||
TMPL | x= 72 | y= 410 | font=serif | "If the partner is a disregarded entity (DE), enter the partner’s:" |
|||
TMPL | x= 57.6 | y= 398.1 | font=serif | "TIN" |
|||
TMPL | x= 144 | y= 398.1 | font=serif | "Name" |
|||
TMPL | x= 316.3 | y= 400 | font=serif | "9b" |
|||
TMPL | x= 334.2 | y= 400 | font=serif | "Collectibles (28%) gain (loss)" |
|||
TMPL | x= 40.2 | y= 386 | font=serif | "I1" |
|||
TMPL | x= 57.6 | y= 387 | font=serif | "What type of entity is this partner?" |
|||
TMPL | x= 453.3 | y= 388 | font=serif | "20" |
|||
TMPL | x= 471 | y= 388 | font=serif | "Other information" |
|||
TMPL | x= 40.2 | y= 374 | font=serif | "I2" |
|||
TMPL | x= 57.6 | y= 374 | font=serif | "If this partner is a retirement plan (IRA/SEP/Keogh/etc.), check here" |
|||
TMPL | x= 276 | y= 374 | font=serif | "." |
|||
TMPL | x= 316.4 | y= 376 | font=serif | "9c" |
|||
TMPL | x= 334.2 | y= 376 | font=serif | "Unrecaptured section 1250 gain" |
|||
TMPL | x= 41.3 | y= 362 | font=serif | "J" |
|||
TMPL | x= 57.6 | y= 362 | font=serif | "Partner’s share of profit, loss, and capital (see instructions):" |
|||
DATA | x= 455.2 | y= 362.8 | font=sans-serif | "A" |
|||
DATA | x= 525.6 | y= 362.8 | font=sans-serif | "SEE STMT" |
|||
TMPL | x= 110 | y= 352.5 | font=serif | "Beginning" |
|||
TMPL | x= 229.8 | y= 352.5 | font=serif | "Ending" |
|||
TMPL | x= 316.5 | y= 352 | font=serif | "10" |
|||
TMPL | x= 334.2 | y= 352 | font=serif | "Net section 1231 gain (loss)" |
|||
TMPL | x= 57.6 | y= 338 | font=serif | "Profit" |
|||
DATA | x= 139.1 | y= 339.1 | font=sans-serif | "3.032900" |
|||
TMPL | x= 183.7 | y= 338 | font=serif | "%" |
|||
DATA | x= 250.1 | y= 339.1 | font=sans-serif | "0.000000" |
|||
TMPL | x= 291.7 | y= 338 | font=serif | "%" |
|||
DATA | x= 455.2 | y= 338.5 | font=sans-serif | "B" |
|||
DATA | x= 525.6 | y= 339 | font=sans-serif | "SEE STMT" |
|||
TMPL | x= 57.6 | y= 326 | font=serif | "Loss" |
|||
DATA | x= 139.1 | y= 326.1 | font=sans-serif | "3.032900" |
|||
TMPL | x= 183.7 | y= 326 | font=serif | "%" |
|||
DATA | x= 250.1 | y= 326.1 | font=sans-serif | "0.000000" |
|||
TMPL | x= 291.7 | y= 326 | font=serif | "%" |
|||
TMPL | x= 316.5 | y= 328 | font=serif | "11" |
|||
TMPL | x= 334.2 | y= 328 | font=serif | "Other income (loss)" |
|||
TMPL | x= 57.6 | y= 314.5 | font=serif | "Capital" |
|||
DATA | x= 139.1 | y= 314.2 | font=sans-serif | "3.032900" |
|||
TMPL | x= 183.7 | y= 314 | font=serif | "%" |
|||
DATA | x= 250.1 | y= 314.2 | font=sans-serif | "0.000000" |
|||
TMPL | x= 291.7 | y= 314 | font=serif | "%" |
|||
DATA | x= 314.2 | y= 314.4 | font=sans-serif | "ZZ*" |
|||
DATA | x= 403.9 | y= 314.4 | font=sans-serif | "(409,615)" |
|||
DATA | x= 455.2 | y= 315.6 | font=sans-serif | "V" |
|||
DATA | x= 525.6 | y= 314.6 | font=sans-serif | "SEE STMT" |
|||
TMPL | x= 57.6 | y= 302 | font=serif | "Check if decrease is due to:" |
|||
TMPL | x= 72 | y= 290 | font=serif | "Sale" |
|||
TMPL | x= 89.9 | y= 290 | font=serif | "or" |
|||
TMPL | x= 115.2 | y= 290 | font=serif | "Exchange of partnership interest. See instructions." |
|||
DATA | x= 456.4 | y= 291.3 | font=sans-serif | "*" |
|||
DATA | x= 525.6 | y= 290.3 | font=sans-serif | "SEE STMT" |
|||
TMPL | x= 38.7 | y= 278 | font=serif | "K1" |
|||
TMPL | x= 57.6 | y= 278 | font=serif | "Partner’s share of liabilities:" |
|||
TMPL | x= 316.5 | y= 280 | font=serif | "12" |
|||
TMPL | x= 334.2 | y= 280 | font=serif | "Section 179 deduction" |
|||
TMPL | x= 453.3 | y= 280 | font=serif | "21" |
|||
TMPL | x= 471 | y= 280 | font=serif | "Foreign taxes paid or accrued" |
|||
TMPL | x= 160.6 | y= 268.5 | font=serif | "Beginning" |
|||
TMPL | x= 251.2 | y= 268.5 | font=serif | "Ending" |
|||
DATA | x= 456.4 | y= 267.1 | font=sans-serif | "*" |
|||
DATA | x= 555.6 | y= 266.1 | font=sans-serif | "196" |
|||
TMPL | x= 57.6 | y= 254 | font=serif | "Nonrecourse" |
|||
TMPL | x= 108 | y= 254 | font=serif | "." |
|||
TMPL | x= 120 | y= 254 | font=serif | "." |
|||
TMPL | x= 134.9 | y= 254 | font=serif | "$" |
|||
DATA | x= 180.8 | y= 254.5 | font=sans-serif | "498,211" |
|||
TMPL | x= 221.3 | y= 254 | font=serif | "$" |
|||
TMPL | x= 316.5 | y= 256 | font=serif | "13" |
|||
TMPL | x= 334.2 | y= 256 | font=serif | "Other deductions" |
|||
TMPL | x= 57.6 | y= 238.4 | font=serif | "Qualified nonrecourse" |
|||
TMPL | x= 57.6 | y= 230 | font=serif | "financing" |
|||
TMPL | x= 96 | y= 230 | font=serif | "." |
|||
TMPL | x= 108 | y= 230 | font=serif | "." |
|||
TMPL | x= 120 | y= 230 | font=serif | "." |
|||
TMPL | x= 134.9 | y= 230 | font=serif | "$" |
|||
TMPL | x= 221.3 | y= 230 | font=serif | "$" |
|||
TMPL | x= 57.6 | y= 218.5 | font=serif | "Recourse" |
|||
TMPL | x= 96 | y= 218.5 | font=serif | "." |
|||
TMPL | x= 108 | y= 218.5 | font=serif | "." |
|||
TMPL | x= 120 | y= 218.5 | font=serif | "." |
|||
TMPL | x= 134.9 | y= 218.5 | font=serif | "$" |
|||
TMPL | x= 221.3 | y= 218.5 | font=serif | "$" |
|||
TMPL | x= 38.7 | y= 207 | font=serif | "K2" |
|||
TMPL | x= 57.6 | y= 207 | font=serif | "Check this box if item K1 includes liability amounts from lower-tier partnerships" |
|||
DATA | x= 294.9 | y= 205.8 | font=sans-serif | "X" |
|||
TMPL | x= 38.7 | y= 195 | font=serif | "K3" |
|||
TMPL | x= 57.6 | y= 195 | font=serif | "Check if any of the above liability is subject to guarantees or other" |
|||
TMPL | x= 57.6 | y= 186 | font=serif | "payment obligations by the partner. See instructions" |
|||
TMPL | x= 228 | y= 186 | font=serif | "." |
|||
TMPL | x= 240 | y= 186 | font=serif | "." |
|||
TMPL | x= 252 | y= 186 | font=serif | "." |
|||
TMPL | x= 264 | y= 186 | font=serif | "." |
|||
TMPL | x= 276 | y= 186 | font=serif | "." |
|||
TMPL | x= 316.2 | y= 183.1 | font=serif | "22" |
|||
TMPL | x= 345.6 | y= 183 | font=serif | "More than one activity for at-risk purposes*" |
|||
TMPL | x= 41.1 | y= 170 | font=serif | "L" |
|||
TMPL | x= 122.3 | y= 170.5 | font=serif | "Partner’s Capital Account Analysis" |
|||
TMPL | x= 316.2 | y= 171.1 | font=serif | "23" |
|||
TMPL | x= 345.6 | y= 171 | font=serif | "More than one activity for passive activity purposes*" |
|||
TMPL | x= 57.6 | y= 158 | font=serif | "Beginning capital account" |
|||
TMPL | x= 156 | y= 158 | font=serif | "." |
|||
TMPL | x= 168 | y= 158 | font=serif | "." |
|||
TMPL | x= 180 | y= 158 | font=serif | "." |
|||
TMPL | x= 189.5 | y= 157.6 | font=serif | "$" |
|||
DATA | x= 257.8 | y= 157.4 | font=sans-serif | "4,903,568" |
|||
TMPL | x= 316.6 | y= 158.6 | font=serif | "*See attached statement for additional information." |
|||
TMPL | x= 57.6 | y= 146 | font=serif | "Capital contributed during the year" |
|||
TMPL | x= 168 | y= 146 | font=serif | "." |
|||
TMPL | x= 180 | y= 146 | font=serif | "." |
|||
TMPL | x= 189.5 | y= 145.6 | font=serif | "$" |
|||
TMPL | x= 57.6 | y= 134 | font=serif | "Current year net income (loss)" |
|||
TMPL | x= 156 | y= 134 | font=serif | "." |
|||
TMPL | x= 168 | y= 134 | font=serif | "." |
|||
TMPL | x= 180 | y= 134 | font=serif | "." |
|||
TMPL | x= 189.5 | y= 133.6 | font=serif | "$" |
|||
DATA | x= 259.3 | y= 133.7 | font=sans-serif | "(409,811)" |
|||
TMPL | x= 57.6 | y= 122 | font=serif | "Other increase (decrease) (attach explanation)" |
|||
TMPL | x= 189.5 | y= 121.6 | font=serif | "$" |
|||
TMPL | x= 57.6 | y= 110 | font=serif | "Withdrawals and distributions" |
|||
TMPL | x= 156 | y= 110 | font=serif | "." |
|||
TMPL | x= 168 | y= 110 | font=serif | "." |
|||
TMPL | x= 180 | y= 110 | font=serif | "." |
|||
TMPL | x= 189.5 | y= 109.6 | font=serif | "$" |
|||
TMPL | x= 195.4 | y= 110.5 | font=serif | "(" |
|||
DATA | x= 257.8 | y= 109.4 | font=sans-serif | "4,493,757" |
|||
TMPL | x= 300.4 | y= 110.5 | font=serif | ")" |
|||
TMPL | x= 57.6 | y= 99 | font=serif | "Ending capital account" |
|||
TMPL | x= 144 | y= 99 | font=serif | "." |
|||
TMPL | x= 156 | y= 99 | font=serif | "." |
|||
TMPL | x= 168 | y= 99 | font=serif | "." |
|||
TMPL | x= 180 | y= 99 | font=serif | "." |
|||
TMPL | x= 189.5 | y= 97.6 | font=serif | "$" |
|||
TMPL | x= 40 | y= 86 | font=serif | "M" |
|||
TMPL | x= 58.4 | y= 86 | font=serif | "Did the partner contribute property with a built-in gain (loss)?" |
|||
TMPL | x= 72 | y= 74 | font=serif | "Yes" |
|||
DATA | x= 101.2 | y= 74.2 | font=sans-serif | "X" |
|||
TMPL | x= 115.2 | y= 74 | font=serif | "No" |
|||
TMPL | x= 136.8 | y= 74 | font=serif | "If “Yes,” attach statement. See instructions." |
|||
TMPL | x= 40.6 | y= 62 | font=serif | "N" |
|||
TMPL | x= 70.1 | y= 62 | font=serif | "Partner’s Share of Net Unrecognized Section 704(c) Gain or (Loss)" |
|||
TMPL | x= 323 | y= 61.3 | font=serif | "For IRS Use Only" |
|||
TMPL | x= 57.6 | y= 51 | font=serif | "Beginning" |
|||
TMPL | x= 96 | y= 51 | font=serif | "." |
|||
TMPL | x= 108 | y= 51 | font=serif | "." |
|||
TMPL | x= 120 | y= 51 | font=serif | "." |
|||
TMPL | x= 132 | y= 51 | font=serif | "." |
|||
TMPL | x= 144 | y= 51 | font=serif | "." |
|||
TMPL | x= 156 | y= 51 | font=serif | "." |
|||
TMPL | x= 168 | y= 51 | font=serif | "." |
|||
TMPL | x= 180 | y= 51 | font=serif | "." |
|||
TMPL | x= 189.1 | y= 51 | font=serif | "$" |
|||
DATA | x= 271.5 | y= 49.7 | font=sans-serif | "(5,373)" |
|||
TMPL | x= 57.6 | y= 39 | font=serif | "Ending" |
|||
TMPL | x= 84 | y= 39 | font=serif | "." |
|||
TMPL | x= 96 | y= 39 | font=serif | "." |
|||
TMPL | x= 108 | y= 39 | font=serif | "." |
|||
TMPL | x= 120 | y= 39 | font=serif | "." |
|||
TMPL | x= 132 | y= 39 | font=serif | "." |
|||
TMPL | x= 144 | y= 39 | font=serif | "." |
|||
TMPL | x= 156 | y= 39 | font=serif | "." |
|||
TMPL | x= 168 | y= 39 | font=serif | "." |
|||
TMPL | x= 180 | y= 39 | font=serif | "." |
|||
TMPL | x= 189.1 | y= 39 | font=serif | "$" |
|||
TMPL | x= 36 | y= 26 | font=serif | "For Paperwork Reduction Act Notice, see the Instructions for Form 1065." |
|||
TMPL | x= 283.9 | y= 26 | font=serif | "www.irs.gov/Form1065" |
|||
TMPL | x= 362.7 | y= 26 | font=serif | "Cat. No. 11394R" |
|||
TMPL | x= 419.6 | y= 26 | font=serif | "Schedule K-1 (Form 1065) 2025" |
|||
TMPL | x= 524.9 | y= 26 | font=serif | "Created 2/26/25" |
|||
DATA | x= 285.6 | y= 5.5 | font=sans-serif | "Page 2 of 31" |
|||
DATA | x= 92.1 | y= 2.8 | font=sans-serif | "(409,811)" |
|||
|
|||
Done. |
|||
@ -0,0 +1,535 @@ |
|||
# Research: Normalized Relational Model for K-1 Financial Data |
|||
|
|||
**Phase 0 Output** | **Date**: 2026-03-20 | **Research Only — No Code** |
|||
|
|||
--- |
|||
|
|||
## Context |
|||
|
|||
The current system stores K-1 box data as a flat JSON blob on `KDocument.data`: |
|||
|
|||
```json |
|||
{"1": 50000, "9a": -1200, "11-ZZ*": 500, "20-A": 1200} |
|||
``` |
|||
|
|||
Aggregations are computed on-the-fly in `k1-aggregation.service.ts` by iterating JSON keys. `CellMapping` provides label metadata, and `CellAggregationRule` defines which box keys to SUM. The system currently has ~80+ possible K-1 fields (boxes 1–21 with subtypes, Sections J/K/L/M/N, metadata fields like A–I). |
|||
|
|||
The goal is to evaluate whether and how to transform this into a normalized relational model. |
|||
|
|||
--- |
|||
|
|||
## Topic 1: Wide vs Normalized Financial Data Models |
|||
|
|||
### Decision |
|||
|
|||
**Move to a normalized fact table** (`K1LineItem`) for Part III financial data (boxes 1–21), but **keep a JSON metadata column** for Part I/II identity fields (A–I, J–N) that are queried infrequently. |
|||
|
|||
### Rationale |
|||
|
|||
The current JSON blob approach has these specific weaknesses for analytics: |
|||
|
|||
**Query limitations observed in this codebase:** |
|||
1. **No SQL-level filtering or aggregation** — The `computeForKDocument()` method in `k1-aggregation.service.ts` must fetch the entire `KDocument` row, deserialize JSON, and loop through `Object.entries(data)` in application code. This means you cannot write `SELECT SUM(amount) FROM ... WHERE box_number = '1' AND tax_year BETWEEN 2020 AND 2025` — every aggregation requires fetching and deserializing all rows. |
|||
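2. **No indexes on values** — Range queries on `data->'1'` cannot use a general-purpose JSONB index. GIN indexes support containment (`@>`), but they don't help with `>`, `<`, or `BETWEEN` on numeric values within the JSON; the only alternative is a separate B-tree expression index per key (e.g. on `((data->>'1')::numeric)`), which does not scale to 80+ keys.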
|||
3. **No referential integrity** — A typo like `"9A"` vs `"9a"` silently creates bad data. The current `CellMapping` table defines valid box numbers, but nothing enforces that `KDocument.data` keys match them. |
|||
4. **Cross-document aggregation is O(n) deserialization** — To compute "total ordinary income (Box 1) across all partnerships for 2025," every KDocument row matching the year must be fetched and parsed. With 50+ partnerships, this is 50+ JSON deserializations for one number; a five-year trend (50+ partnerships × 5 years) requires 250+.
|||
5. **No partial update tracking** — When a KDocument transitions from ESTIMATED → FINAL, the entire JSON blob is replaced. `previousData` preserves the old blob but provides no field-level diff. |
|||
6. **Schema evolution is invisible** — If the IRS adds a Box 6d in 2027, there's no migration — it just appears as a new JSON key. This sounds convenient but means no validation, no type checking, and no discoverability for future NL-to-SQL. |
|||
|
|||
**When the wide/JSON model is acceptable:** |
|||
- Archival storage of the complete raw extraction (already served by `K1ImportSession.rawExtraction`) |
|||
- Rarely-queried metadata fields (Part I/II: partnership name, EIN, addresses) |
|||
- Configurations and user preferences (already used for `Settings.settings`) |
|||
- Fewer than ~10 documents with no cross-document queries needed |
|||
|
|||
**When it breaks down (the current situation):** |
|||
- Cross-entity/cross-year aggregation (core family office use case) |
|||
- Performance analytics over time (partnership returns by year) |
|||
- Tax planning queries ("show me all partnerships with Section 1231 losses > $10K") |
|||
- Audit trail at field granularity |
|||
- LLM-generated SQL queries (LLMs cannot reliably generate JSONB path expressions) |
|||
|
|||
### Alternatives Considered |
|||
|
|||
| Alternative | Pros | Cons | |
|||
|---|---|---| |
|||
| **Keep JSON blob** (status quo) | No migration, flexible schema | All query limitations above; blocks analytics roadmap | |
|||
| **JSONB with generated columns** | No schema change for K-1 fields; PostgreSQL 12+ supports `GENERATED ALWAYS AS ((data->>'1')::numeric) STORED` | Max ~30 generated columns practical; doesn't scale to 80+ fields; still no FK integrity |
|||
| **Wide table with 80+ columns** | Simple queries, strong typing | Extremely sparse (most K-1s populate ~20 of 80+ boxes); ALTER TABLE for every IRS form change; NULL-heavy | |
|||
| **Normalized fact table** (chosen) | SQL aggregation, indexes, FK integrity, LLM-friendly, field-level audit trail | More JOINs; migration effort; slightly more complex insert logic | |
|||
|
|||
--- |
|||
|
|||
## Topic 2: EAV vs Normalized Tables for Tax Document Fields |
|||
|
|||
### Decision |
|||
|
|||
**Use a hybrid approach**: a single EAV-style fact table (`K1LineItem`) for all Part III financial line items, combined with a reference/dimension table (`K1BoxDefinition`) that provides metadata, typing, and validation rules. Keep Part I/II identity metadata as structured JSON on the KDocument. |
|||
|
|||
This is technically EAV but with strong constraints — it's closer to a **typed fact table** pattern than classic unconstrained EAV. |
|||
|
|||
### Rationale |
|||
|
|||
**Why EAV is appropriate here (and usually isn't):** |
|||
|
|||
Classic EAV fails because it loses type safety, makes queries verbose, and resists validation. K-1 data avoids these pitfalls because: |
|||
|
|||
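1. **Uniform value type** — Nearly all Part III financial values (boxes 1–21) are `Decimal` amounts. Unlike generic EAV where attributes might be strings, dates, booleans, or blobs, K-1 line items are almost uniformly monetary amounts with a known currency. The few exceptions (e.g. a "SEE STMT" reference in place of an amount, or the Box 16 "Schedule K-3 is attached" checkbox) go in a single `textValue` fallback column, which avoids the full "value_string / value_number / value_date" anti-pattern.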
|||
|
|||
2. **Closed attribute set** — The IRS defines ~50 Part III line items. This is not open-ended. The `K1BoxDefinition` reference table enumerates all valid attributes, so there's no unbounded attribute sprawl. |
|||
|
|||
3. **Natural query pattern** — The primary queries are aggregations across one attribute dimension: `SUM(amount) WHERE box_key = '1'`. This is exactly what EAV is good at — pivot-style aggregation across a known set of attributes. |
|||
|
|||
4. **Sparse data** — A typical K-1 populates 15–25 of ~50 possible line items. A wide table would be 50–70% NULL. The EAV/fact table stores only populated fields, which is both space-efficient and semantically clearer. |
|||
|
|||
**Proposed structure (conceptual):** |
|||
|
|||
``` |
|||
K1BoxDefinition (reference/dimension table) |
|||
├── boxKey VARCHAR PK -- "1", "9a", "11-ZZ*", "20-A" |
|||
├── label VARCHAR -- "Ordinary business income (loss)" |
|||
├── section VARCHAR -- "PART_III", "PART_I", "SECTION_J" |
|||
├── dataType VARCHAR -- "CURRENCY", "PERCENTAGE", "BOOLEAN", "TEXT" |
|||
├── sortOrder INT |
|||
├── irsFormLine VARCHAR -- "Box 1", "Box 9a", "Section J, Line 1" |
|||
└── description TEXT |
|||
|
|||
K1LineItem (fact table — one row per box per KDocument) |
|||
├── id UUID PK |
|||
├── kDocumentId UUID FK → KDocument.id |
|||
├── boxKey VARCHAR FK → K1BoxDefinition.boxKey |
|||
├── amount DECIMAL(15,2) -- financial value (null for non-monetary) |
|||
├── textValue VARCHAR -- for text/boolean fields if needed |
|||
├── sourceConfidence DECIMAL(3,2) -- 0.00–1.00, from extraction |
|||
├── sourcePageNumber INT -- PDF page where extracted |
|||
├── sourceCoordinates JSON -- {x, y, width, height} on the page |
|||
├── isUserEdited BOOLEAN -- true if user modified during verification |
|||
├── createdAt TIMESTAMP |
|||
├── updatedAt TIMESTAMP |
|||
└── @@unique([kDocumentId, boxKey]) |
|||
``` |
|||
|
|||
**Why not separate normalized tables for each box category:** |
|||
|
|||
An alternative is dedicated tables: `K1IncomeItems`, `K1DeductionItems`, `K1CreditItems`, `K1CapitalAccount`, etc. This was rejected because: |
|||
- K-1 boxes don't cleanly partition into fixed categories (Box 11 "Other income" spans multiple categories via sub-codes) |
|||
- Sub-code boxes (11-A through 11-ZZ*, 13-A through 13-ZZ*, 20-A through 20-ZZ*) have partnership-specific meaning — the same structural pattern repeats across boxes |
|||
- It would require 6–8 tables with identical column shapes, making queries harder, not easier |
|||
- The `K1BoxDefinition` reference table provides the categorical metadata without needing separate physical tables |
|||
|
|||
**Treatment of Part I/II metadata fields:** |
|||
|
|||
Fields like Partnership EIN (Box A), Partner name (Box F), Section J percentages, and Section L capital account data are better stored as structured JSON on `KDocument` in a `metadata` column because: |
|||
- They're queried for display, not for aggregation |
|||
- They have heterogeneous types (strings, booleans, percentages, addresses) |
|||
- They identify the document rather than representing financial facts |
|||
- There are ~30 of them, and they're almost all populated (not sparse) |
|||
|
|||
### Alternatives Considered |
|||
|
|||
| Alternative | Pros | Cons | |
|||
|---|---|---| |
|||
| **Pure EAV (no reference table)** | Maximum flexibility | No validation of box keys; `CellMapping` already serves this role but without FK enforcement | |
|||
| **Wide table (one column per box)** | Simple SELECTs for specific boxes | 80+ columns; 50–70% NULLs; ALTER TABLE for new boxes; poor for cross-box aggregation | |
|||
| **Separate tables per box category** | Strong typing per category | 6–8 near-identical tables; complex UNION queries; sub-code boxes don't fit cleanly | |
|||
| **Hybrid EAV + reference table** (chosen) | Uniform fact table; strong FK validation; sparse-friendly; single query pattern for aggregation; field-level provenance | Pivot queries needed for "show one K-1 as a form"; slightly more complex writes | |
|||
|
|||
--- |
|||
|
|||
## Topic 3: Financial Fact Tables for Tax Data |
|||
|
|||
### Decision |
|||
|
|||
**Model K-1 line items as a financial fact table** in a star-schema-inspired design, with KDocument as the central bridge to dimension tables (Partnership, Entity, TaxYear). Monetary values stored as `DECIMAL(15,2)` with explicit currency. |
|||
|
|||
### Rationale |
|||
|
|||
Financial data warehouses consistently use a fact/dimension pattern for tax line items: |
|||
|
|||
**Star schema mapping for K-1 data:** |
|||
|
|||
``` |
|||
┌──────────────┐ |
|||
│ Partnership │ (dimension) |
|||
│ ────────── │ |
|||
│ id, name, │ |
|||
│ type, ein │ |
|||
└──────┬───────┘ |
|||
│ |
|||
┌──────────────┐ ┌──────┴───────┐ ┌──────────────────┐ |
|||
│ Entity │────│ KDocument │────│ K1BoxDefinition │ (dimension) |
|||
│ (dimension) │ │ (bridge) │ │ ────────────────│ |
|||
│ ────────── │ │ ────────── │ │ boxKey, label, │ |
|||
│ id, name, │ │ id, taxYear,│ │ section, type │ |
|||
│ type, taxId │ │ status │ └──────────────────┘ |
|||
└──────────────┘ └──────┬───────┘ |
|||
│ |
|||
┌──────┴───────┐ |
|||
│ K1LineItem │ (FACT) |
|||
│ ────────── │ |
|||
│ amount, │ |
|||
│ boxKey, │ |
|||
│ confidence │ |
|||
└──────────────┘ |
|||
``` |
|||
|
|||
**Best practices from financial data warehousing applied here:** |
|||
|
|||
1. **Additive facts only** — `K1LineItem.amount` is fully additive: you can SUM across tax years, partnerships, entities, or box types. Non-additive data (percentages, booleans, text) is stored separately in `textValue` or on the KDocument metadata. |
|||
|
|||
2. **Grain = one box value per K-1 document** — Each row in `K1LineItem` represents one financial amount from one K-1 for one tax year. This is the atomic grain. Aggregation rules from `CellAggregationRule` operate on this grain. |
|||
|
|||
3. **Slowly changing dimensions** — `PartnershipMembership` already handles SCD Type 2 (effective dates) for ownership percentages. `K1BoxDefinition` is SCD Type 1 (overwritten on IRS form changes, with version tracking if needed). |
|||
|
|||
4. **Conformed dimensions** — `Partnership` and `Entity` serve as conformed dimensions shared between K-1 facts, Distribution facts, and Valuation facts. A single `Entity` dimension joins to multiple fact tables. |
|||
|
|||
5. **Currency handling** — Store amounts in the source currency with a `currency` column. The KDocument inherits currency from Partnership. Conversion to reporting currency happens at query time or in materialized views, never by mutating the fact. |
|||
|
|||
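6. **Decimal precision** — `DECIMAL(15,2)` covers amounts up to $9,999,999,999,999.99 (15 significant digits: 13 before the decimal point, 2 after). K-1 amounts from large partnerships (PE funds, hedge funds) can reach tens of millions, so this provides ample headroom. Use 2 decimal places so that cent-level values are preserved when present; K-1 amounts are typically reported in whole dollars, which this scale stores without loss.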
|||
|
|||
**Aggregation queries enabled by this model:** |
|||
|
|||
```sql |
|||
-- Total ordinary income across all partnerships for 2025 |
|||
SELECT SUM(li.amount) |
|||
FROM k1_line_item li |
|||
JOIN k_document kd ON li.k_document_id = kd.id |
|||
WHERE li.box_key = '1' AND kd.tax_year = 2025; |
|||
|
|||
-- Income breakdown by entity for tax year 2025 |
|||
SELECT e.name, li.box_key, SUM(li.amount) |
|||
FROM k1_line_item li |
|||
JOIN k_document kd ON li.k_document_id = kd.id |
|||
JOIN partnership p ON kd.partnership_id = p.id |
|||
JOIN partnership_membership pm ON pm.partnership_id = p.id |
|||
JOIN entity e ON pm.entity_id = e.id |
|||
WHERE kd.tax_year = 2025 |
|||
GROUP BY e.name, li.box_key; |
|||
|
|||
-- Partnership performance: Box 1 over time |
|||
SELECT kd.tax_year, p.name, li.amount |
|||
FROM k1_line_item li |
|||
JOIN k_document kd ON li.k_document_id = kd.id |
|||
JOIN partnership p ON kd.partnership_id = p.id |
|||
WHERE li.box_key = '1' |
|||
ORDER BY kd.tax_year; |
|||
``` |
|||
|
|||
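These queries are impossible or impractical with the current JSON blob model. Note that the entity breakdown joins `partnership_membership` on `partnership_id` alone, so it is only correct when each partnership has a single membership row; with several member entities, or with SCD Type 2 membership history, the join must also be restricted to the K-1's recipient entity and to memberships effective in the tax year, otherwise each amount is counted once per membership row.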
|||
|
|||
### Alternatives Considered |
|||
|
|||
| Alternative | Pros | Cons | |
|||
|---|---|---| |
|||
| **Snowflake schema (more normalization)** | Normalized box categories into sub-dimensions | Over-normalized for ~50 box types; extra JOINs for no benefit | |
|||
| **Flat denormalized reporting table** | Fastest reads; no JOINs | Write complexity; data duplication; hard to keep consistent | |
|||
| **OLAP cube / column store** | Best aggregation performance | Overkill for <10K rows; adds infrastructure complexity | |
|||
| **Star-schema-inspired fact table** (chosen) | Natural fit for K-1 aggregation queries; leverages existing dimensions; PostgreSQL handles this scale trivially | Requires JOINs for full context (acceptable) | |
|||
|
|||
--- |
|||
|
|||
## Topic 4: Source Traceability in Financial Systems |
|||
|
|||
### Decision |
|||
|
|||
**Store extraction provenance at the line-item grain** — each `K1LineItem` records the source page number, bounding-box coordinates, raw extracted text (`rawExtractedText`, a column not yet shown in the Topic 2 schema sketch), confidence score, and whether it was user-edited. The `K1ImportSession` retains the complete raw extraction as an immutable JSON snapshot.
|||
|
|||
### Rationale |
|||
|
|||
The audit trail must support this flow: |
|||
|
|||
``` |
|||
Displayed aggregated number |
|||
→ K1LineItem (individual box value) |
|||
→ KDocument (which K-1, which year, which partnership) |
|||
→ K1ImportSession (extraction record) |
|||
→ Document (source PDF file) |
|||
→ Specific page + coordinates on that page |
|||
→ Raw extracted text before parsing |
|||
``` |
|||
|
|||
**Granularity levels and what to store where:** |
|||
|
|||
| Level | Table | Fields | Purpose | |
|||
|---|---|---|---| |
|||
| **Aggregation** | Computed at query time | SUM/formula from `CellAggregationRule` | "Where does this total come from?" → list of K1LineItems | |
|||
| **Line item** | `K1LineItem` | `amount`, `boxKey`, `sourceConfidence`, `sourcePageNumber`, `sourceCoordinates`, `rawExtractedText`, `isUserEdited` | "What exactly was extracted and from where?" | |
|||
| **Document** | `K1ImportSession` | `rawExtraction` (full JSON), `extractionMethod`, `fileName` | "What did the system originally see?" (immutable after extraction) | |
|||
| **File** | `Document` | `filePath`, `fileSize`, `mimeType` | "Where is the original PDF?" | |
|||
|
|||
**Key design principles:** |
|||
|
|||
1. **Immutability of raw extraction** — `K1ImportSession.rawExtraction` is written once at extraction time and never modified. `verifiedData` captures user edits. This provides a complete before/after audit trail. |
|||
|
|||
2. **Coordinate-level provenance** — The current `k1-positions-dump.txt` shows that the parser already extracts `x, y` coordinates for each text element. Storing `sourceCoordinates: {x, y, width, height}` on each `K1LineItem` enables a future "click to highlight in PDF" feature. |
|||
|
|||
3. **Confidence as first-class data** — The system already computes confidence scores (0.0–1.0) during extraction. Persisting this on the line item (not just in the import session JSON) enables queries like "show me all low-confidence values across all partnerships" and supports audit prioritization. |
|||
|
|||
4. **User edit tracking** — `isUserEdited: boolean` distinguishes machine-extracted values from human-verified overrides. This is critical for audit and for training future extraction models. |
|||
|
|||
5. **No deletion of source data** — When a KDocument transitions from ESTIMATED → FINAL, the old line items should be soft-versioned (via `KDocument.previousData` or a separate version table), not deleted. |
|||
|
|||
**What NOT to store at line-item level:** |
|||
- Full PDF binary (stay on Document/filesystem) |
|||
- Complete OCR output for the entire page (stay on K1ImportSession.rawExtraction) |
|||
- Rendering coordinates for non-K-1 text on the page (not relevant) |
|||
|
|||
### Alternatives Considered |
|||
|
|||
| Alternative | Pros | Cons | |
|||
|---|---|---| |
|||
| **Provenance only at document level** | Simpler; fewer columns | Cannot trace an individual number back to a specific location on a page | |
|||
| **Separate provenance table** (K1LineItemProvenance) | Clean separation of concerns | Extra JOIN for every audit query; a 1:1 relationship is usually better modeled as columns on the same table | |
|||
| **Store full page image crops per line item** | Visual proof | Massive storage; PDF coordinates + original file are sufficient for re-rendering | |
|||
| **Provenance on line item** (chosen) | Direct traceability; no extra JOINs; enables "highlight in PDF"; supports audit queries | Slightly wider rows (acceptable for <10K rows) | |
|||
|
|||
--- |
|||
|
|||
## Topic 5: PostgreSQL Materialized Views for Financial Reporting |
|||
|
|||
### Decision |
|||
|
|||
**Use materialized views for cross-partnership/cross-year aggregation dashboards**, refreshed on a schedule or triggered by KDocument changes. Use regular views for single-document or single-partnership queries. Do **not** use denormalized reporting tables. |
|||
|
|||
### Rationale |
|||
|
|||
**When to use each approach in this system:** |
|||
|
|||
| Scenario | Approach | Reason | |
|||
|---|---|---| |
|||
| "Show Box 1–21 for one K-1" | Regular query on `K1LineItem` | Small result set; no aggregation; fast enough | |
|||
| "Total income by box for one partnership across years" | Regular SQL `GROUP BY` | <20 rows × <10 years = <200 rows; trivial for PostgreSQL | |
|||
| "Dashboard: all partnerships × all entities × 5 years" | **Materialized view** | Cross-joins across dimensions; 50 partnerships × 5 entities × 5 years × 20 boxes = 25,000 aggregated values; worth pre-computing | |
|||
| "Tax planning: find partnerships with specific loss patterns" | Materialized view with supporting indexes (PostgreSQL has no separate "indexed view" feature) | Complex filtering across many K-1s | |
|||
| "YoY change in Box 1 by partnership" | Materialized view | Window functions over multiple years | |
|||
|
|||
**Proposed materialized views:** |
|||
|
|||
```sql |
|||
-- MV 1: K-1 Summary by Partnership/Year |
|||
CREATE MATERIALIZED VIEW mv_k1_partnership_year_summary AS |
|||
SELECT |
|||
kd.partnership_id, |
|||
kd.tax_year, |
|||
li.box_key, |
|||
bd.label, |
|||
bd.section, |
|||
SUM(li.amount) AS total_amount, |
|||
COUNT(*) AS line_count, |
|||
kd.filing_status |
|||
FROM k1_line_item li |
|||
JOIN k_document kd ON li.k_document_id = kd.id |
|||
JOIN k1_box_definition bd ON li.box_key = bd.box_key |
|||
GROUP BY kd.partnership_id, kd.tax_year, li.box_key, bd.label, bd.section, kd.filing_status; |
|||
|
|||
-- MV 2: Entity-level Income Aggregation |
|||
CREATE MATERIALIZED VIEW mv_entity_income_summary AS |
|||
SELECT |
|||
e.id AS entity_id, |
|||
e.name AS entity_name, |
|||
kd.tax_year, |
|||
li.box_key, |
|||
SUM(li.amount * pm.ownership_percent / 100) AS allocated_amount |
|||
FROM k1_line_item li |
|||
JOIN k_document kd ON li.k_document_id = kd.id |
|||
JOIN partnership_membership pm ON pm.partnership_id = kd.partnership_id |
|||
JOIN entity e ON pm.entity_id = e.id |
|||
WHERE pm.effective_date <= make_date(kd.tax_year, 12, 31) |
|||
AND (pm.end_date IS NULL OR pm.end_date > make_date(kd.tax_year, 12, 31)) |
|||
GROUP BY e.id, e.name, kd.tax_year, li.box_key; |
|||
``` |
|||
|
|||
**Refresh strategy:** |
|||
|
|||
- **Trigger-based refresh**: After any KDocument insert/update/delete or status change to FINAL, refresh affected materialized views. In NestJS, this is a `@OnEvent('k-document.changed')` handler that calls `REFRESH MATERIALIZED VIEW CONCURRENTLY`. |
|||
- **`CONCURRENTLY` keyword**: Allows reads during refresh (requires a unique index on the MV). Essential for a multi-user system. |
|||
- **Frequency**: For a family office with <100 K-1s updated per year, refresh takes <1 second. No scheduling needed — event-driven refresh is sufficient. |
|||
|
|||
**Why not denormalized reporting tables:** |
|||
|
|||
Denormalized tables (duplicating data into a flat reporting structure) require write-time consistency management — every KDocument change must update the reporting table transactionally. This is the pattern used in high-write OLTP systems, but K-1 data is low-write (<100 writes/year) and high-read (dashboards queried many times). Materialized views handle this perfectly with zero application-level sync logic. |
|||
|
|||
**Why not computed/generated columns:** |
|||
|
|||
PostgreSQL generated columns cannot reference other tables. Since aggregations span KDocument → K1LineItem → Partnership → Entity, generated columns are structurally insufficient. |
|||
|
|||
### Alternatives Considered |
|||
|
|||
| Alternative | Pros | Cons | |
|||
|---|---|---| |
|||
| **Application-level caching** (Redis/in-memory) | No DB schema changes | Cache invalidation complexity; doesn't help SQL-based analytics | |
|||
| **Denormalized reporting tables** | Fastest reads; works at any scale | Write-time maintenance burden; consistency bugs; overkill for <10K rows | |
|||
| **Regular views** (not materialized) | Always fresh; no refresh needed | Recomputed on every query; slow for cross-entity dashboards | |
|||
| **Materialized views** (chosen) | Pre-computed; concurrent reads; event-driven refresh; zero application-level sync | Slight staleness (mitigated by event-driven refresh); requires unique indexes for CONCURRENTLY | |
|||
|
|||
--- |
|||
|
|||
## Topic 6: Migration Strategy from JSON Blob to Normalized Tables |
|||
|
|||
### Decision |
|||
|
|||
**Phase the migration in 3 steps**: (1) Create new tables alongside existing JSON, (2) Dual-write to both during a transition period, (3) Make normalized tables authoritative. **Keep the JSON blob immutable as an archive** — never delete it. |
|||
|
|||
### Rationale |
|||
|
|||
**Step 1: Additive schema changes (zero breaking changes)** |
|||
|
|||
``` |
|||
Migration 1: Create K1BoxDefinition table, seed with IRS default box definitions |
|||
Migration 2: Create K1LineItem table with FK to KDocument and K1BoxDefinition |
|||
Migration 3: Backfill K1LineItem from existing KDocument.data JSON blobs |
|||
``` |
|||
|
|||
The backfill migration (Migration 3 of Step 1): |
|||
|
|||
```sql |
|||
-- Pseudocode: For each KDocument, iterate JSON keys and insert K1LineItems |
|||
INSERT INTO k1_line_item (id, k_document_id, box_key, amount, created_at, updated_at) |
|||
SELECT |
|||
gen_random_uuid(), |
|||
kd.id, |
|||
je.key, |
|||
(je.value)::decimal, |
|||
kd.created_at, |
|||
NOW() |
|||
FROM k_document kd, |
|||
jsonb_each(kd.data::jsonb) AS je(key, value) |
|||
WHERE jsonb_typeof(je.value) = 'number'; |
|||
``` |
|||
|
|||
**Step 2: Dual-write transition period** |
|||
|
|||
During the transition: |
|||
- `k1-import.service.ts` `confirmImport()` writes to **both** `KDocument.data` (JSON) and `K1LineItem` (rows) |
|||
- Read operations gradually migrate from JSON-based to K1LineItem-based |
|||
- `k1-aggregation.service.ts` switches from JSON iteration to `SELECT SUM` on K1LineItem |
|||
- Run validation queries comparing JSON-derived totals to K1LineItem-derived totals |
|||
|
|||
**Step 3: K1LineItem becomes authoritative** |
|||
|
|||
- New features (dashboards, tax planning, LLM queries) read only from K1LineItem |
|||
- `KDocument.data` is retained as immutable archive but no longer written to for new documents |
|||
- `CellAggregationRule.sourceCells` continues to work — the boxKey values are the same strings |
|||
- `CellMapping` evolves into or is replaced by `K1BoxDefinition` |
|||
|
|||
**Should the old JSON be kept immutable?** |
|||
|
|||
**Yes, permanently.** Reasons: |
|||
1. **Audit requirement** — The JSON blob is the original imported representation. Regulatory and audit standards require preserving source data in its original form. |
|||
2. **Rollback safety** — If the migration has bugs, the JSON blob is the recovery source. |
|||
3. **Storage is trivial** — A JSON blob with ~30 key-value pairs is <1 KB. Even 1,000 KDocuments = <1 MB total. There's no storage pressure to delete it. |
|||
4. **Import session already preserves extraction** — `K1ImportSession.rawExtraction` holds the pre-verification extraction. `KDocument.data` holds the post-verification snapshot. Both should survive indefinitely. |
|||
|
|||
**Backward compatibility considerations:** |
|||
|
|||
- The `KDocument.data` column type stays `Json` (not nullable, not removed) |
|||
- The existing `k-document-form.component.ts` UI reads from `KDocument.data` — it continues to work during transition |
|||
- The `computeForKDocument()` aggregation service works against JSON through the transition, then switches to K1LineItem queries |
|||
- No existing API contracts change — `GET /k-documents/:id` returns the same shape |
|||
|
|||
**Handling the CellMapping → K1BoxDefinition transition:** |
|||
|
|||
The existing `CellMapping` table (per-partnership box definitions) maps closely to the proposed `K1BoxDefinition`. The migration strategy: |
|||
- `K1BoxDefinition` absorbs the global (partnershipId = null) CellMapping records |
|||
- Per-partnership CellMapping overrides become per-partnership `K1BoxDefinition` rows (or remain as display-layer configuration separate from the data model) |
|||
- `CellMapping` fields like `isIgnored`, `isCustom` are presentation concerns that may not belong on the data-layer `K1BoxDefinition` |
|||
|
|||
### Alternatives Considered |
|||
|
|||
| Alternative | Pros | Cons | |
|||
|---|---|---| |
|||
| **Big-bang migration** (drop JSON, create tables, migrate in one step) | Clean; no dual-write complexity | Risk of data loss; requires full feature freeze; hard to validate | |
|||
| **Dual-write indefinitely** | Maximum safety | Permanent write overhead; divergence risk between JSON and rows | |
|||
| **Keep JSON as authoritative, add views** | No migration of writes | Doesn't solve the core query limitation; views over JSONB are slow | |
|||
| **Phased migration with immutable archive** (chosen) | Zero-downtime; incremental validation; rollback possible; preserves audit trail | Dual-write period adds complexity (bounded to weeks, not permanent) | |
|||
|
|||
--- |
|||
|
|||
## Topic 7: Schema Design for Future LLM NL-to-SQL |
|||
|
|||
### Decision |
|||
|
|||
**Design tables with self-documenting names, add PostgreSQL `COMMENT ON` annotations for every table and column, use consistent naming conventions, and avoid ambiguity between similarly-named entities.** |
|||
|
|||
### Rationale |
|||
|
|||
LLMs generating SQL (via text-to-SQL or NL-to-SQL) work by receiving the schema as context and mapping natural language to table/column references. The schema itself is the prompt. Research from the Spider benchmark (Yale), BIRD benchmark, and production NL-to-SQL systems (e.g., Vanna.ai, DataHerald) identifies these factors as most impactful: |
|||
|
|||
**1. Naming conventions that LLMs parse correctly:** |
|||
|
|||
| Current Name | Problem | Proposed Name | Why Better | |
|||
|---|---|---|---| |
|||
| `KDocument` | "K" is ambiguous to LLMs | `k1_document` | Explicitly says "K-1" | |
|||
| `KDocument.data` | "data" is the most generic possible name | `k1_document.raw_data_json` | Describes what it holds | |
|||
| `K1LineItem.amount` | Could be confused with Distribution.amount | `k1_line_item.reported_amount` | Disambiguates | |
|||
| `CellMapping` | "Cell" is a spreadsheet term, not a tax term | `k1_box_definition` | Domain-specific | |
|||
| `CellAggregationRule` | LLMs may not connect "cell" to K-1 boxes | `k1_aggregation_rule` | Clearer context | |
|||
|
|||
**Naming conventions to adopt:** |
|||
- `snake_case` for all table and column names (PostgreSQL convention; LLMs are trained on more snake_case SQL than camelCase) |
|||
- Prefix K-1-specific tables with `k1_` to create a namespace |
|||
- Use `_id` suffix for all foreign keys |
|||
- Avoid abbreviations (`partnership_id` not `ptnr_id`) |
|||
- Use `_at` suffix for timestamps (`created_at`, `updated_at`) |
|||
- Use descriptive names over short names (`tax_year` not `yr`, `filing_status` not `status`) |
|||
|
|||
**2. PostgreSQL COMMENT annotations:** |
|||
|
|||
```sql |
|||
COMMENT ON TABLE k1_line_item IS 'Individual financial line item from an IRS Schedule K-1 (Form 1065). One row per box number per K-1 document.'; |
|||
COMMENT ON COLUMN k1_line_item.box_key IS 'IRS K-1 box identifier such as "1" for ordinary income, "9a" for long-term capital gains, or "20-A" for other information code A.'; |
|||
COMMENT ON COLUMN k1_line_item.reported_amount IS 'Dollar amount reported on this K-1 line item, in the partnership base currency. Negative values represent losses.'; |
|||
COMMENT ON TABLE k1_box_definition IS 'Reference table of IRS Schedule K-1 box definitions. Maps box identifiers to human-readable labels and categories.'; |
|||
``` |
|||
|
|||
LLM NL-to-SQL systems extract these comments as schema context. A model asked "what is total ordinary income?" can map "ordinary income" → `k1_box_definition.label = 'Ordinary business income (loss)'` → `box_key = '1'` → join to `k1_line_item`. |
|||
|
|||
**3. Avoiding ambiguity:** |
|||
|
|||
Current pain points for LLM-generated SQL: |
|||
- `Distribution.amount` vs `K1LineItem.amount` — an LLM asked "total distributions" might query the wrong table. Solution: `k1_line_item.reported_amount` vs `distribution.distribution_amount`. |
|||
- `Partnership` has `distributions`, `kDocuments`, `valuations` — naming all FK columns `partnership_id` is correct and expected by LLMs. |
|||
- `Entity` is overloaded (database entities, legal entities). The table comment must clarify: "A legal person or structure (trust, LLC, individual) that owns assets and receives K-1 allocations." |
|||
|
|||
**4. Schema metadata table for LLM context:** |
|||
|
|||
Consider a lightweight `schema_metadata` table or a markdown document that provides the LLM with: |
|||
- Table relationships in natural language |
|||
- Common query patterns with examples |
|||
- Business rules ("Box 19a distributions are allocated to entities by ownership percentage") |
|||
- Valid values for enum columns |
|||
|
|||
This is cheaper than fine-tuning and more maintainable than few-shot prompts. |
|||
|
|||
**5. Avoid patterns that confuse LLMs:** |
|||
|
|||
| Anti-pattern | Why It Confuses LLMs | Alternative | |
|||
|---|---|---| |
|||
| JSON columns for queryable data | LLMs generate `->` / `->>` operators inconsistently | Normalized columns | |
|||
| Composite primary keys | LLMs often forget one part of the key in JOINs | Surrogate UUID PK + unique constraint | |
|||
| Polymorphic FKs (one FK, multiple target tables) | LLMs can't determine which table to JOIN | Separate FK columns | |
|||
| Generic column names (`type`, `status`, `data`, `value`) | Ambiguous across tables | Prefix with table context (`filing_status`, `box_data_type`) | |
|||
| Soft deletes (`is_deleted`) | LLMs forget the `WHERE is_deleted = false` filter | Use `end_date IS NULL` pattern (already in use for memberships) | |
|||
|
|||
### Alternatives Considered |
|||
|
|||
| Alternative | Pros | Cons | |
|||
|---|---|---| |
|||
| **No schema changes for LLM** | No work | LLM accuracy drops significantly with ambiguous/generic names; JSONB columns are nearly unusable for NL-to-SQL | |
|||
| **Fine-tune LLM on this schema** | Can handle any naming convention | Expensive; needs retraining on every schema change; vendor lock-in | |
|||
| **RAG over schema docs** | Flexible; schema-aware | Still limited by underlying schema quality; garbage-in-garbage-out | |
|||
| **Self-documenting schema + COMMENT annotations** (chosen) | Works with any LLM; zero runtime cost; maintainable; improves human readability too | Requires discipline to maintain comments on schema changes | |
|||
|
|||
--- |
|||
|
|||
## Summary of Decisions |
|||
|
|||
| # | Topic | Decision | |
|||
|---|---|---| |
|||
| 1 | Wide vs Normalized | Normalized fact table for Part III financial data; JSON retained for Part I/II metadata | |
|||
| 2 | EAV vs Normalized | Hybrid: typed EAV fact table (`K1LineItem`) with reference dimension (`K1BoxDefinition`); uniform `DECIMAL` value type avoids classic EAV pitfalls | |
|||
| 3 | Financial fact tables | Star-schema-inspired design with `K1LineItem` as fact, `KDocument`/`Partnership`/`Entity` as dimensions | |
|||
| 4 | Source traceability | Per-line-item provenance (page, coordinates, confidence, raw text, user-edit flag); K1ImportSession.rawExtraction as immutable full extraction archive | |
|||
| 5 | Materialized views | Event-driven materialized views for cross-entity dashboards; regular queries for single-document access | |
|||
| 6 | Migration strategy | 3-phase: additive tables → dual-write → K1LineItem authoritative; JSON blob kept immutable forever | |
|||
| 7 | LLM NL-to-SQL | Self-documenting `snake_case` names, `COMMENT ON` annotations, disambiguation of similar columns, `k1_` table prefix namespace | |
|||
@ -0,0 +1,21 @@ |
|||
import { PrismaClient } from '@prisma/client'; |
|||
// Maintenance script: removes every row from the Prisma models listed below
// and then prints whatever users remain, as a check that the wipe succeeded.
const p = new PrismaClient();

try {
  // Delete all data in dependency order.
  // The calls are awaited one at a time on purpose: running them in parallel
  // would lose the ordering this list relies on.
  await p.access.deleteMany();
  await p.order.deleteMany();
  await p.accountBalance.deleteMany();
  await p.account.deleteMany();
  await p.symbolProfile.deleteMany();
  await p.marketData.deleteMany();
  await p.settings.deleteMany();
  await p.subscription.deleteMany();
  await p.authDevice.deleteMany();
  await p.analytics.deleteMany();
  await p.user.deleteMany();

  console.log('All users deleted.');

  // Read the user table back so the log shows whether any row survived.
  const users = await p.user.findMany({ select: { id: true, role: true } });
  console.log('USERS after delete:', JSON.stringify(users));
} finally {
  // Always release the database connection, even when a delete or the
  // verification query rejects; the error still propagates afterwards.
  await p.$disconnect();
}
|||
Loading…
Reference in new issue