Update ledgerai to read existing transactions from beancount file

2025-12-20 15:33:08 -05:00
parent f56c559c84
commit 70ae5daadb
10 changed files with 91 additions and 83 deletions
@@ -1,4 +1,5 @@
 # ---> Python
+uv.lock
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -7,35 +7,28 @@ name = "toldg"
 version = "0.1.0"
 description = "Tool to generate ledger files from csv"
 readme = "README.md"
-requires-python = ">=3.12,<4.0"
+requires-python = ">=3.13,<4.0"
 license = {text = "MIT"}
 authors = [
    {name = "Felix Martin", email = "mail@felixm.de"}
 ]
 dependencies = [
-    "fava (>=1.30.1,<2.0.0)",
-    "pydantic (>=2.10.6,<3.0.0)",
-    "beancount (>=3.1.0,<4.0.0)",
-    "rich (>=13.9.4,<14.0.0)",
-    "numpy (>=2.2.3,<3.0.0)"
+    "fava",
+    "pydantic",
+    "beancount",
+    "rich",
+    "numpy",
+    "ty",
+    "ruff",
 ]

-[tool.poetry.group.dev.dependencies]
-pre-commit = "^4.1.0"
-black = "^25.1.0"
-isort = "^6.0.1"
-pytest = "^8.3.4"
-
 [project.scripts]
 toldg = "toldg.__main__:main"

 [tool.setuptools]
 package-dir = {"" = "src"}

-[tool.black]
-line-length = 88
-target-version = ["py312"]
+[tool.ruff]
+target-version = "py313" 
+line-length = 100

-[tool.isort]
-profile = "black"
-line_length = 88
@@ -7,7 +7,8 @@ from rich.logging import RichHandler

 from toldg.process import process_csv_files, process_ldg_files
 from toldg.train import train
-from toldg.utils import load_config, remove_if_exists, write_meta
+from toldg.utils import load_config, remove_if_exists
+from toldg.models import Config


 def init_logging():
@@ -19,10 +20,28 @@ def init_logging():
    )


+def get_new_entries(entries: list, csv_entries: list) -> list:
+    key_to_entry = {
+        (entry.meta["source_file"], entry.meta["source_index"]): entry for entry in entries
+    }
+    assert len(entries) == len(key_to_entry), "Transaction keys must be unique"

-def load_and_write_back(filename):
-    entries, errors, options_map = beancount.loader.load_file(filename)
+    new_entries = []
+    for csv_entry in csv_entries:
+        key = (csv_entry.csv_file, csv_entry.index)
+        if key in key_to_entry:
+            existing_entry = key_to_entry[key]
+            if existing_entry.meta["source_row"] != csv_entry.row:
+                msg = f"Consistency error: CSV entry {csv_entry} is different to {existing_entry}"
+                logging.error(msg)
+                raise SystemExit(1)
+        else:
+            new_entries.append(csv_entry)
+    logging.info(f"Got {len(new_entries)} new and {len(entries)} existing transactions.")
+    return new_entries

+
+def update_ledger(config: Config):
    def beancount_entry_to_string(entry) -> str:
        buf = io.StringIO()
        beancount.parser.printer.print_entry(entry, file=buf)
@@ -31,15 +50,35 @@ def load_and_write_back(filename):
    def is_transaction(entry) -> bool:
        return isinstance(entry, beancount.core.data.Transaction)

-    prev_entry_was_transaction = False
+    filename = config.output_file
+    entries, errors, options_map = beancount.loader.load_file(filename)
+
    if errors:
-        print(f"errors in generated '{filename}'")
+        logging.error(f"errors in '{filename}'")
        for err in errors:
-            print(err)
-    else:
+            logging.error(err)
+        raise SystemExit(1)
+
    entries.sort(key=lambda e: e.date)
-        with open(filename, "w") as f:
-            f.write('option "operating_currency" "USD"\n')
+
+    # Note(felixm): Only write back entries that come from the main beancount
+    # file. The issue is that `beancount.loader.load_file` does not allow for a
+    # full round trip; some of the entries get swallowed. Therefore, treat all
+    # files other than the main beancount file as input-only files. This means
+    # these input-only files can only be edited by hand, but the user can use
+    # them to set options for beancount and fava, and add other types of
+    # entries that would otherwise disappear after the round trip. I have seen
+    # tickets on GitHub about changing this API so that everything can be
+    # written back as is, but until then, this works well for my use case.
+    entries = [e for e in entries if e.meta["filename"] == str(filename.absolute())]
+
+    csv_entries = process_csv_files(config, False)
+    entries += get_new_entries(entries, csv_entries)
+    remove_if_exists(config.output_file)
+    process_ldg_files(config)
+
+    with open(filename, "a") as f:
+        prev_entry_was_transaction = False
        for entry in entries:
            if prev_entry_was_transaction:
                f.write("\n")
@@ -48,6 +87,7 @@ def load_and_write_back(filename):
            f.write(beancount_entry_to_string(entry))
            f.write("\n")
            prev_entry_was_transaction = is_transaction(entry)
+    logging.info(f"Ledger file '{filename}' was written successfully.")


 def main():
@@ -56,11 +96,7 @@ def main():
    if len(sys.argv) > 2 and sys.argv[2] == "train":
        train(config)
    else:
-        remove_if_exists(config.output_file)
-        write_meta(config)
-        process_csv_files(config)
-        load_and_write_back(config.output_file)
-        process_ldg_files(config)
+        update_ledger(config)


 if __name__ == "__main__":
@@ -8,9 +8,7 @@ EXECUTABLE_NAME = "fzf.exe" if sys.platform == "win32" else "fzf"
 def iterfzf(iterable, prompt="> "):
    cmd = [EXECUTABLE_NAME, "--prompt=" + prompt]
    encoding = sys.getdefaultencoding()
-    proc = subprocess.Popen(
-        cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None
-    )
+    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None)
    if proc.stdin is None:
        return None
    try:
@@ -24,7 +22,10 @@ def iterfzf(iterable, prompt="> "):
        return None
    if proc.stdout is None:
        return None
-    decode = lambda t: t.decode(encoding)
+
+    def decode(t):
+        return t.decode(encoding)
+
    output = [decode(ln.strip(b"\r\n\0")) for ln in iter(proc.stdout.readline, b"")]
    try:
        return output[0]
@@ -86,4 +86,3 @@ class Transaction(BaseModel):

    def key(self):
        return self.csv_file + ", " + self.row
-
@@ -1,11 +1,10 @@
-import json
 import logging
 import os
 import pickle
 import re
-from collections import Counter, defaultdict
+from collections import Counter
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Tuple

 import numpy as np

@@ -31,9 +30,7 @@ class Tokenizer:
            word_counts.update(tokens)

        # Filter words by minimum count
-        filtered_words = [
-            word for word, count in word_counts.items() if count >= self.min_count
-        ]
+        filtered_words = [word for word, count in word_counts.items() if count >= self.min_count]

        # Build vocabulary
        self.vocab = {word: idx for idx, word in enumerate(filtered_words)}
@@ -373,7 +370,6 @@ def get_sort_categories(model_path: Path):
        _classifier.sort_categories(row, categories)

    try:
-
        model_path = Path("transaction_classifier.pkl")
        _classifier = TransactionClassifier(model_path)
        if _classifier.model is None:
@@ -385,9 +381,7 @@ def get_sort_categories(model_path: Path):
    return sort_categories


-def add_account2(
-    model_path: Path, transactions: List[Transaction], categories: List[str]
-):
+def add_account2(model_path: Path, transactions: List[Transaction], categories: List[str]):
    """Add account2 to unmapped transactions."""
    unmapped_transactions = list(filter(lambda t: t.mapping is None, transactions))
    if len(unmapped_transactions) == 0:
@@ -14,10 +14,9 @@ from toldg.models import Config, CsvConfig, Mapping, Transaction

 def process_ldg_files(config: Config):
    with open(config.output_file, "a") as f_out:
-        f_out.write("\n")
        for ldg_file in toldg.utils.get_ldg_files(config.input_directory):
            ldg_rel = os.path.relpath(ldg_file, os.path.dirname(config.output_file))
-            f_out.write(f"include \"{ldg_rel}\"\n")
+            f_out.write(f'include "{ldg_rel}"\n')


 def get_csv_config(csv_file: str, csv_configs: list[CsvConfig]) -> CsvConfig:
@@ -63,8 +62,7 @@ def get_transactions(csv_file: str, config: CsvConfig) -> list[Transaction]:
        for _ in range(config.skip):
            next(reader)
        rows = [row for row in reader if row]
-    transactions = [row_to_transaction(i, row, fields)
-                    for i, row in enumerate(reversed(rows))]
+    transactions = [row_to_transaction(i, row, fields) for i, row in enumerate(reversed(rows))]
    return transactions


@@ -74,9 +72,7 @@ def apply_mappings(transactions: list[Transaction], mappings: dict[str, Mapping]
        if t.key() in mappings:
            mapping = mappings[t.key()]
            assert isinstance(mapping, Mapping)
-            assert (
-                mapping.count > 0
-            ), f"{mapping} used by {t} but count is not greater than '0'."
+            assert mapping.count > 0, f"{mapping} used by {t} but count is not greater than '0'."
            mapping.count -= 1
            t.mapping = mapping
        else:
@@ -86,7 +82,7 @@ def apply_mappings(transactions: list[Transaction], mappings: dict[str, Mapping]
        assert mapping.count == 0, f"{mapping} was not used as often as expected!"


-def process_csv_files(config: Config) -> list[Transaction]:
+def process_csv_files(config: Config, write_outputs: bool = True) -> list[Transaction]:
    csv_files = toldg.utils.get_csv_files(config.input_directory)
    transactions = []
    for csv_file in csv_files:
@@ -97,6 +93,7 @@ def process_csv_files(config: Config) -> list[Transaction]:
    mappings = toldg.utils.read_mappings(config.mappings_file)
    apply_mappings(transactions, mappings)
    toldg.predict.add_account2(config.model, transactions, config.categories)
+    if write_outputs:
        toldg.utils.write_mappings(transactions, config.mappings_file)
        toldg.write.render_to_file(transactions, config)
    return transactions
@@ -1,5 +1,4 @@
 import logging
-from pathlib import Path

 from toldg.models import Config
 from toldg.predict import train_classifier
@@ -3,7 +3,7 @@ import logging
 import os
 import sys
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Dict, List

 from pydantic import ValidationError

@@ -57,14 +57,6 @@ def category_to_bean(c: str) -> str:
    return ":".join(new_sections)


-def write_meta(config: Config):
-    with open(config.output_file, "a") as f:
-        for category in config.categories:
-            f.write(f"2017-01-01 open {category_to_bean(category)}\n")
-        f.write("\n")
-        f.write('option "operating_currency" "USD"\n\n')
-
-
 def write_mappings(transactions: List[Transaction], mappings_file: Path):
    """Write transactions to the mappings file."""

@@ -74,10 +66,8 @@ def write_mappings(transactions: List[Transaction], mappings_file: Path):
            pass
        else:
            mapping = Mapping(
-                **{
-                    "account2": t.account2.strip(),
-                    "narration": t.description,
-                }
+                account2=t.account2.strip(),
+                narration=t.description,
            )
            mappings[t.key()] = mapping

@@ -1,4 +1,3 @@
-from pathlib import Path
 from typing import List

 from toldg.models import Config, Transaction
@@ -6,11 +5,11 @@ from toldg.utils import category_to_bean

 BEANCOUNT_TRANSACTION_TEMPLATE = """
 {t.date} * {description}{tags}
-    {account2:<40}  {t.debit:<6} {t.currency}
-    {account1:<40}  {t.credit:<6} {t.currency}
    source_file: "{t.csv_file}"
    source_index: {t.index}
    source_row: "{t.row}"
+    {account2:<40}  {t.debit:<6} {t.currency}
+    {account1:<40}  {t.credit:<6} {t.currency}
 """


@@ -58,4 +57,3 @@ def render_to_file(transactions: List[Transaction], config: Config):
    content = "".join(format(t) for t in transactions)
    with open(config.output_file, "a") as f:
        f.write(content)
-