Update ledgerai to read existing transactions from beancount file

This commit is contained in:
2025-12-20 15:33:08 -05:00
parent f56c559c84
commit 70ae5daadb
10 changed files with 91 additions and 83 deletions

1
.gitignore vendored
View File

@@ -1,4 +1,5 @@
# ---> Python
uv.lock
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

View File

@@ -7,35 +7,28 @@ name = "toldg"
version = "0.1.0"
description = "Tool to generate ledger files from csv"
readme = "README.md"
requires-python = ">=3.12,<4.0"
requires-python = ">=3.13,<4.0"
license = {text = "MIT"}
authors = [
{name = "Felix Martin", email = "mail@felixm.de"}
]
dependencies = [
"fava (>=1.30.1,<2.0.0)",
"pydantic (>=2.10.6,<3.0.0)",
"beancount (>=3.1.0,<4.0.0)",
"rich (>=13.9.4,<14.0.0)",
"numpy (>=2.2.3,<3.0.0)"
"fava",
"pydantic",
"beancount",
"rich",
"numpy",
"ty",
"ruff",
]
[tool.poetry.group.dev.dependencies]
pre-commit = "^4.1.0"
black = "^25.1.0"
isort = "^6.0.1"
pytest = "^8.3.4"
[project.scripts]
toldg = "toldg.__main__:main"
[tool.setuptools]
package-dir = {"" = "src"}
[tool.black]
line-length = 88
target-version = ["py312"]
[tool.ruff]
target-version = "py313"
line-length = 100
[tool.isort]
profile = "black"
line_length = 88

View File

@@ -7,7 +7,8 @@ from rich.logging import RichHandler
from toldg.process import process_csv_files, process_ldg_files
from toldg.train import train
from toldg.utils import load_config, remove_if_exists, write_meta
from toldg.utils import load_config, remove_if_exists
from toldg.models import Config
def init_logging():
@@ -19,10 +20,28 @@ def init_logging():
)
def get_new_entries(entries: list, csv_entries: list) -> list:
key_to_entry = {
(entry.meta["source_file"], entry.meta["source_index"]): entry for entry in entries
}
assert len(entries) == len(key_to_entry), "Transaction keys must be unique"
def load_and_write_back(filename):
entries, errors, options_map = beancount.loader.load_file(filename)
new_entries = []
for csv_entry in csv_entries:
key = (csv_entry.csv_file, csv_entry.index)
if key in key_to_entry:
existing_entry = key_to_entry[key]
if existing_entry.meta["source_row"] != csv_entry.row:
msg = f"Consistency error: CSV entry {csv_entry} is different to {existing_entry}"
logging.error(msg)
raise SystemExit(1)
else:
new_entries.append(csv_entry)
logging.info(f"Got {len(new_entries)} new and {len(entries)} existing transactions.")
return new_entries
def update_ledger(config: Config):
def beancount_entry_to_string(entry) -> str:
buf = io.StringIO()
beancount.parser.printer.print_entry(entry, file=buf)
@@ -31,23 +50,44 @@ def load_and_write_back(filename):
def is_transaction(entry) -> bool:
return isinstance(entry, beancount.core.data.Transaction)
prev_entry_was_transaction = False
filename = config.output_file
entries, errors, options_map = beancount.loader.load_file(filename)
if errors:
print(f"errors in generated '{filename}'")
logging.error(f"errors in '{filename}'")
for err in errors:
print(err)
else:
entries.sort(key=lambda e: e.date)
with open(filename, "w") as f:
f.write('option "operating_currency" "USD"\n')
for entry in entries:
if prev_entry_was_transaction:
f.write("\n")
elif not prev_entry_was_transaction and is_transaction(entry):
f.write("\n")
f.write(beancount_entry_to_string(entry))
logging.error(err)
raise SystemExit(1)
entries.sort(key=lambda e: e.date)
# Note(felixm): Only write back transactions from the main beancount file.
# The issue is that `beancount.loader.load_file` does not allow for a full
# round trip; some of the entries get swallowed. Therefore, treat all files
# that are not the main beancount file as input only files. This means
# these input only files can only be edited by hand, but the user can use
# them to set options for beancount and fava, and add other types of
# entries that would otherwise disappear after the round trip. I have seen
# tickets on GitHub about changing this API so that everything can be
# written back as is, but until then, this works well for my use-case.
entries = [e for e in entries if e.meta["filename"] == str(filename.absolute())]
csv_entries = process_csv_files(config, False)
entries += get_new_entries(entries, csv_entries)
remove_if_exists(config.output_file)
process_ldg_files(config)
with open(filename, "a") as f:
prev_entry_was_transaction = False
for entry in entries:
if prev_entry_was_transaction:
f.write("\n")
prev_entry_was_transaction = is_transaction(entry)
elif not prev_entry_was_transaction and is_transaction(entry):
f.write("\n")
f.write(beancount_entry_to_string(entry))
f.write("\n")
prev_entry_was_transaction = is_transaction(entry)
logging.info(f"Ledger file '{filename}' was written successfully.")
def main():
@@ -56,11 +96,7 @@ def main():
if len(sys.argv) > 2 and sys.argv[2] == "train":
train(config)
else:
remove_if_exists(config.output_file)
write_meta(config)
process_csv_files(config)
load_and_write_back(config.output_file)
process_ldg_files(config)
update_ledger(config)
if __name__ == "__main__":

View File

@@ -8,9 +8,7 @@ EXECUTABLE_NAME = "fzf.exe" if sys.platform == "win32" else "fzf"
def iterfzf(iterable, prompt="> "):
cmd = [EXECUTABLE_NAME, "--prompt=" + prompt]
encoding = sys.getdefaultencoding()
proc = subprocess.Popen(
cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None
)
proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None)
if proc.stdin is None:
return None
try:
@@ -24,7 +22,10 @@ def iterfzf(iterable, prompt="> "):
return None
if proc.stdout is None:
return None
decode = lambda t: t.decode(encoding)
def decode(t):
return t.decode(encoding)
output = [decode(ln.strip(b"\r\n\0")) for ln in iter(proc.stdout.readline, b"")]
try:
return output[0]

View File

@@ -86,4 +86,3 @@ class Transaction(BaseModel):
def key(self):
return self.csv_file + ", " + self.row

View File

@@ -1,11 +1,10 @@
import json
import logging
import os
import pickle
import re
from collections import Counter, defaultdict
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
from typing import Dict, List, Optional, Tuple
import numpy as np
@@ -31,9 +30,7 @@ class Tokenizer:
word_counts.update(tokens)
# Filter words by minimum count
filtered_words = [
word for word, count in word_counts.items() if count >= self.min_count
]
filtered_words = [word for word, count in word_counts.items() if count >= self.min_count]
# Build vocabulary
self.vocab = {word: idx for idx, word in enumerate(filtered_words)}
@@ -373,7 +370,6 @@ def get_sort_categories(model_path: Path):
_classifier.sort_categories(row, categories)
try:
model_path = Path("transaction_classifier.pkl")
_classifier = TransactionClassifier(model_path)
if _classifier.model is None:
@@ -385,9 +381,7 @@ def get_sort_categories(model_path: Path):
return sort_categories
def add_account2(
model_path: Path, transactions: List[Transaction], categories: List[str]
):
def add_account2(model_path: Path, transactions: List[Transaction], categories: List[str]):
"""Add account2 to unmapped transactions."""
unmapped_transactions = list(filter(lambda t: t.mapping is None, transactions))
if len(unmapped_transactions) == 0:

View File

@@ -14,10 +14,9 @@ from toldg.models import Config, CsvConfig, Mapping, Transaction
def process_ldg_files(config: Config):
with open(config.output_file, "a") as f_out:
f_out.write("\n")
for ldg_file in toldg.utils.get_ldg_files(config.input_directory):
ldg_rel = os.path.relpath(ldg_file, os.path.dirname(config.output_file))
f_out.write(f"include \"{ldg_rel}\"\n")
f_out.write(f'include "{ldg_rel}"\n')
def get_csv_config(csv_file: str, csv_configs: list[CsvConfig]) -> CsvConfig:
@@ -63,8 +62,7 @@ def get_transactions(csv_file: str, config: CsvConfig) -> list[Transaction]:
for _ in range(config.skip):
next(reader)
rows = [row for row in reader if row]
transactions = [row_to_transaction(i, row, fields)
for i, row in enumerate(reversed(rows))]
transactions = [row_to_transaction(i, row, fields) for i, row in enumerate(reversed(rows))]
return transactions
@@ -74,9 +72,7 @@ def apply_mappings(transactions: list[Transaction], mappings: dict[str, Mapping]
if t.key() in mappings:
mapping = mappings[t.key()]
assert isinstance(mapping, Mapping)
assert (
mapping.count > 0
), f"{mapping} used by {t} but count is not greater than '0'."
assert mapping.count > 0, f"{mapping} used by {t} but count is not greater than '0'."
mapping.count -= 1
t.mapping = mapping
else:
@@ -86,7 +82,7 @@ def apply_mappings(transactions: list[Transaction], mappings: dict[str, Mapping]
assert mapping.count == 0, f"{mapping} was not used as often as expected!"
def process_csv_files(config: Config) -> list[Transaction]:
def process_csv_files(config: Config, write_outputs: bool = True) -> list[Transaction]:
csv_files = toldg.utils.get_csv_files(config.input_directory)
transactions = []
for csv_file in csv_files:
@@ -97,6 +93,7 @@ def process_csv_files(config: Config) -> list[Transaction]:
mappings = toldg.utils.read_mappings(config.mappings_file)
apply_mappings(transactions, mappings)
toldg.predict.add_account2(config.model, transactions, config.categories)
toldg.utils.write_mappings(transactions, config.mappings_file)
toldg.write.render_to_file(transactions, config)
if write_outputs:
toldg.utils.write_mappings(transactions, config.mappings_file)
toldg.write.render_to_file(transactions, config)
return transactions

View File

@@ -1,5 +1,4 @@
import logging
from pathlib import Path
from toldg.models import Config
from toldg.predict import train_classifier

View File

@@ -3,7 +3,7 @@ import logging
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Dict, List
from pydantic import ValidationError
@@ -57,14 +57,6 @@ def category_to_bean(c: str) -> str:
return ":".join(new_sections)
def write_meta(config: Config):
with open(config.output_file, "a") as f:
for category in config.categories:
f.write(f"2017-01-01 open {category_to_bean(category)}\n")
f.write("\n")
f.write('option "operating_currency" "USD"\n\n')
def write_mappings(transactions: List[Transaction], mappings_file: Path):
"""Write transactions to the mappings file."""
@@ -74,10 +66,8 @@ def write_mappings(transactions: List[Transaction], mappings_file: Path):
pass
else:
mapping = Mapping(
**{
"account2": t.account2.strip(),
"narration": t.description,
}
account2=t.account2.strip(),
narration=t.description,
)
mappings[t.key()] = mapping

View File

@@ -1,4 +1,3 @@
from pathlib import Path
from typing import List
from toldg.models import Config, Transaction
@@ -6,11 +5,11 @@ from toldg.utils import category_to_bean
BEANCOUNT_TRANSACTION_TEMPLATE = """
{t.date} * {description}{tags}
{account2:<40} {t.debit:<6} {t.currency}
{account1:<40} {t.credit:<6} {t.currency}
source_file: "{t.csv_file}"
source_index: {t.index}
source_row: "{t.row}"
{account2:<40} {t.debit:<6} {t.currency}
{account1:<40} {t.credit:<6} {t.currency}
"""
@@ -58,4 +57,3 @@ def render_to_file(transactions: List[Transaction], config: Config):
content = "".join(format(t) for t in transactions)
with open(config.output_file, "a") as f:
f.write(content)