Update ledgerai to read existing transactions from beancount file

2025-12-20 15:33:08 -05:00
parent f56c559c84
commit 70ae5daadb
10 changed files with 91 additions and 83 deletions
@@ -1,4 +1,5 @@
 # ---> Python
+uv.lock
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -7,35 +7,28 @@ name = "toldg"
 version = "0.1.0"
 description = "Tool to generate ledger files from csv"
 readme = "README.md"
-requires-python = ">=3.12,<4.0"
+requires-python = ">=3.13,<4.0"
 license = {text = "MIT"}
 authors = [
    {name = "Felix Martin", email = "mail@felixm.de"}
 ]
 dependencies = [
-    "fava (>=1.30.1,<2.0.0)",
+    "fava",
-    "pydantic (>=2.10.6,<3.0.0)",
+    "pydantic",
-    "beancount (>=3.1.0,<4.0.0)",
+    "beancount",
-    "rich (>=13.9.4,<14.0.0)",
+    "rich",
-    "numpy (>=2.2.3,<3.0.0)"
+    "numpy",
    "ty",
    "ruff",
 ]
 [tool.poetry.group.dev.dependencies]
 pre-commit = "^4.1.0"
 black = "^25.1.0"
 isort = "^6.0.1"
 pytest = "^8.3.4"
 [project.scripts]
 toldg = "toldg.__main__:main"
 [tool.setuptools]
 package-dir = {"" = "src"}
-[tool.black]
+[tool.ruff]
-line-length = 88
+target-version = "py313" 
-target-version = ["py312"]
+line-length = 100
 [tool.isort]
 profile = "black"
 line_length = 88
@@ -7,7 +7,8 @@ from rich.logging import RichHandler
 from toldg.process import process_csv_files, process_ldg_files
 from toldg.train import train
-from toldg.utils import load_config, remove_if_exists, write_meta
+from toldg.utils import load_config, remove_if_exists
 from toldg.models import Config
 def init_logging():
@@ -19,10 +20,28 @@ def init_logging():
    )
 def get_new_entries(entries: list, csv_entries: list) -> list:
    key_to_entry = {
        (entry.meta["source_file"], entry.meta["source_index"]): entry for entry in entries
    }
    assert len(entries) == len(key_to_entry), "Transaction keys must be unique"
-def load_and_write_back(filename):
+    new_entries = []
-    entries, errors, options_map = beancount.loader.load_file(filename)
+    for csv_entry in csv_entries:
        key = (csv_entry.csv_file, csv_entry.index)
        if key in key_to_entry:
            existing_entry = key_to_entry[key]
            if existing_entry.meta["source_row"] != csv_entry.row:
                msg = f"Consistency error: CSV entry {csv_entry} is different to {existing_entry}"
                logging.error(msg)
                raise SystemExit(1)
        else:
            new_entries.append(csv_entry)
    logging.info(f"Got {len(new_entries)} new and {len(entries)} existing transactions.")
    return new_entries
 def update_ledger(config: Config):
    def beancount_entry_to_string(entry) -> str:
        buf = io.StringIO()
        beancount.parser.printer.print_entry(entry, file=buf)
@@ -31,23 +50,44 @@ def load_and_write_back(filename):
    def is_transaction(entry) -> bool:
        return isinstance(entry, beancount.core.data.Transaction)
-    prev_entry_was_transaction = False
+    filename = config.output_file
    entries, errors, options_map = beancount.loader.load_file(filename)
    if errors:
-        print(f"errors in generated '{filename}'")
+        logging.error(f"errors in '{filename}'")
        for err in errors:
-            print(err)
+            logging.error(err)
-    else:
+        raise SystemExit(1)
-        entries.sort(key=lambda e: e.date)
+
-        with open(filename, "w") as f:
+    entries.sort(key=lambda e: e.date)
-            f.write('option "operating_currency" "USD"\n')
+
-            for entry in entries:
+    # Note(felixm): Only write back transactions from the main beancount file.
-                if prev_entry_was_transaction:
+    # The issue is that `beancount.loader.load_file` does not allow for a full
-                    f.write("\n")
+    # round trip; some of the entries get swallowed. Therefore, treat all files
-                elif not prev_entry_was_transaction and is_transaction(entry):
+    # that are not the main beancount file as input only files. This means
-                    f.write("\n")
+    # these input only files can only be edited by hand, but the user can use
-                f.write(beancount_entry_to_string(entry))
+    # them to set options for beancount and fava, and add other types of
    # entries that would otherwise disappear after the round trip. I have seen
    # tickets on GitHub about changing this API so that everything can be
    # written back as is, but until then, this works well for my use-case.
    entries = [e for e in entries if e.meta["filename"] == str(filename.absolute())]
    csv_entries = process_csv_files(config, False)
    entries += get_new_entries(entries, csv_entries)
    remove_if_exists(config.output_file)
    process_ldg_files(config)
    with open(filename, "a") as f:
        prev_entry_was_transaction = False
        for entry in entries:
            if prev_entry_was_transaction:
                f.write("\n")
-                prev_entry_was_transaction = is_transaction(entry)
+            elif not prev_entry_was_transaction and is_transaction(entry):
                f.write("\n")
            f.write(beancount_entry_to_string(entry))
            f.write("\n")
            prev_entry_was_transaction = is_transaction(entry)
    logging.info(f"Ledger file '{filename}' was written successfully.")
 def main():
@@ -56,11 +96,7 @@ def main():
    if len(sys.argv) > 2 and sys.argv[2] == "train":
        train(config)
    else:
-        remove_if_exists(config.output_file)
+        update_ledger(config)
-        write_meta(config)
-        process_csv_files(config)
-        load_and_write_back(config.output_file)
-        process_ldg_files(config)
 if __name__ == "__main__":
@@ -8,9 +8,7 @@ EXECUTABLE_NAME = "fzf.exe" if sys.platform == "win32" else "fzf"
 def iterfzf(iterable, prompt="> "):
    cmd = [EXECUTABLE_NAME, "--prompt=" + prompt]
    encoding = sys.getdefaultencoding()
-    proc = subprocess.Popen(
+    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None)
-        cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None
-    )
    if proc.stdin is None:
        return None
    try:
@@ -24,7 +22,10 @@ def iterfzf(iterable, prompt="> "):
        return None
    if proc.stdout is None:
        return None
-    decode = lambda t: t.decode(encoding)
+
+    def decode(t):
+        return t.decode(encoding)
    output = [decode(ln.strip(b"\r\n\0")) for ln in iter(proc.stdout.readline, b"")]
    try:
        return output[0]
@@ -86,4 +86,3 @@ class Transaction(BaseModel):
    def key(self):
        return self.csv_file + ", " + self.row
@@ -1,11 +1,10 @@
 import json
 import logging
 import os
 import pickle
 import re
-from collections import Counter, defaultdict
+from collections import Counter
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Tuple
 import numpy as np
@@ -31,9 +30,7 @@ class Tokenizer:
            word_counts.update(tokens)
        # Filter words by minimum count
-        filtered_words = [
+        filtered_words = [word for word, count in word_counts.items() if count >= self.min_count]
-            word for word, count in word_counts.items() if count >= self.min_count
-        ]
        # Build vocabulary
        self.vocab = {word: idx for idx, word in enumerate(filtered_words)}
@@ -373,7 +370,6 @@ def get_sort_categories(model_path: Path):
        _classifier.sort_categories(row, categories)
    try:
        model_path = Path("transaction_classifier.pkl")
        _classifier = TransactionClassifier(model_path)
        if _classifier.model is None:
@@ -385,9 +381,7 @@ def get_sort_categories(model_path: Path):
    return sort_categories
-def add_account2(
+def add_account2(model_path: Path, transactions: List[Transaction], categories: List[str]):
-    model_path: Path, transactions: List[Transaction], categories: List[str]
-):
    """Add account2 to unmapped transactions."""
    unmapped_transactions = list(filter(lambda t: t.mapping is None, transactions))
    if len(unmapped_transactions) == 0:
@@ -14,10 +14,9 @@ from toldg.models import Config, CsvConfig, Mapping, Transaction
 def process_ldg_files(config: Config):
    with open(config.output_file, "a") as f_out:
        f_out.write("\n")
        for ldg_file in toldg.utils.get_ldg_files(config.input_directory):
            ldg_rel = os.path.relpath(ldg_file, os.path.dirname(config.output_file))
-            f_out.write(f"include \"{ldg_rel}\"\n")
+            f_out.write(f'include "{ldg_rel}"\n')
 def get_csv_config(csv_file: str, csv_configs: list[CsvConfig]) -> CsvConfig:
@@ -63,8 +62,7 @@ def get_transactions(csv_file: str, config: CsvConfig) -> list[Transaction]:
        for _ in range(config.skip):
            next(reader)
        rows = [row for row in reader if row]
-    transactions = [row_to_transaction(i, row, fields)
+    transactions = [row_to_transaction(i, row, fields) for i, row in enumerate(reversed(rows))]
-                    for i, row in enumerate(reversed(rows))]
    return transactions
@@ -74,9 +72,7 @@ def apply_mappings(transactions: list[Transaction], mappings: dict[str, Mapping]
        if t.key() in mappings:
            mapping = mappings[t.key()]
            assert isinstance(mapping, Mapping)
-            assert (
+            assert mapping.count > 0, f"{mapping} used by {t} but count is not greater than '0'."
-                mapping.count > 0
-            ), f"{mapping} used by {t} but count is not greater than '0'."
            mapping.count -= 1
            t.mapping = mapping
        else:
@@ -86,7 +82,7 @@ def apply_mappings(transactions: list[Transaction], mappings: dict[str, Mapping]
        assert mapping.count == 0, f"{mapping} was not used as often as expected!"
-def process_csv_files(config: Config) -> list[Transaction]:
+def process_csv_files(config: Config, write_outputs: bool = True) -> list[Transaction]:
    csv_files = toldg.utils.get_csv_files(config.input_directory)
    transactions = []
    for csv_file in csv_files:
@@ -97,6 +93,7 @@ def process_csv_files(config: Config) -> list[Transaction]:
    mappings = toldg.utils.read_mappings(config.mappings_file)
    apply_mappings(transactions, mappings)
    toldg.predict.add_account2(config.model, transactions, config.categories)
-    toldg.utils.write_mappings(transactions, config.mappings_file)
+    if write_outputs:
-    toldg.write.render_to_file(transactions, config)
+        toldg.utils.write_mappings(transactions, config.mappings_file)
        toldg.write.render_to_file(transactions, config)
    return transactions
@@ -1,5 +1,4 @@
 import logging
 from pathlib import Path
 from toldg.models import Config
 from toldg.predict import train_classifier
@@ -3,7 +3,7 @@ import logging
 import os
 import sys
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Dict, List
 from pydantic import ValidationError
@@ -57,14 +57,6 @@ def category_to_bean(c: str) -> str:
    return ":".join(new_sections)
-def write_meta(config: Config):
-    with open(config.output_file, "a") as f:
-        for category in config.categories:
-            f.write(f"2017-01-01 open {category_to_bean(category)}\n")
-        f.write("\n")
-        f.write('option "operating_currency" "USD"\n\n')
 def write_mappings(transactions: List[Transaction], mappings_file: Path):
    """Write transactions to the mappings file."""
@@ -74,10 +66,8 @@ def write_mappings(transactions: List[Transaction], mappings_file: Path):
            pass
        else:
            mapping = Mapping(
-                **{
+                account2=t.account2.strip(),
-                    "account2": t.account2.strip(),
+                narration=t.description,
-                    "narration": t.description,
-                }
            )
            mappings[t.key()] = mapping
@@ -1,4 +1,3 @@
 from pathlib import Path
 from typing import List
 from toldg.models import Config, Transaction
@@ -6,11 +5,11 @@ from toldg.utils import category_to_bean
 BEANCOUNT_TRANSACTION_TEMPLATE = """
 {t.date} * {description}{tags}
    {account2:<40}  {t.debit:<6} {t.currency}
    {account1:<40}  {t.credit:<6} {t.currency}
    source_file: "{t.csv_file}"
    source_index: {t.index}
    source_row: "{t.row}"
    {account2:<40}  {t.debit:<6} {t.currency}
    {account1:<40}  {t.credit:<6} {t.currency}
 """
@@ -58,4 +57,3 @@ def render_to_file(transactions: List[Transaction], config: Config):
    content = "".join(format(t) for t in transactions)
    with open(config.output_file, "a") as f:
        f.write(content)
@@ -86,4 +86,3 @@ class Transaction(BaseModel):
    def key(self):
        return self.csv_file + ", " + self.row