diff --git a/.gitignore b/.gitignore index 28b315a..49c520a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # ---> Python +uv.lock # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/pyproject.toml b/pyproject.toml index 0656932..fd34f8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,35 +7,28 @@ name = "toldg" version = "0.1.0" description = "Tool to generate ledger files from csv" readme = "README.md" -requires-python = ">=3.12,<4.0" +requires-python = ">=3.13,<4.0" license = {text = "MIT"} authors = [ {name = "Felix Martin", email = "mail@felixm.de"} ] dependencies = [ - "fava (>=1.30.1,<2.0.0)", - "pydantic (>=2.10.6,<3.0.0)", - "beancount (>=3.1.0,<4.0.0)", - "rich (>=13.9.4,<14.0.0)", - "numpy (>=2.2.3,<3.0.0)" + "fava", + "pydantic", + "beancount", + "rich", + "numpy", + "ty", + "ruff", ] -[tool.poetry.group.dev.dependencies] -pre-commit = "^4.1.0" -black = "^25.1.0" -isort = "^6.0.1" -pytest = "^8.3.4" - [project.scripts] toldg = "toldg.__main__:main" [tool.setuptools] package-dir = {"" = "src"} -[tool.black] -line-length = 88 -target-version = ["py312"] +[tool.ruff] +target-version = "py313" +line-length = 100 -[tool.isort] -profile = "black" -line_length = 88 diff --git a/src/toldg/__main__.py b/src/toldg/__main__.py index a8bc231..9891611 100644 --- a/src/toldg/__main__.py +++ b/src/toldg/__main__.py @@ -7,7 +7,8 @@ from rich.logging import RichHandler from toldg.process import process_csv_files, process_ldg_files from toldg.train import train -from toldg.utils import load_config, remove_if_exists, write_meta +from toldg.utils import load_config, remove_if_exists +from toldg.models import Config def init_logging(): @@ -19,10 +20,28 @@ def init_logging(): ) +def get_new_entries(entries: list, csv_entries: list) -> list: + key_to_entry = { + (entry.meta["source_file"], entry.meta["source_index"]): entry for entry in entries + } + assert len(entries) == len(key_to_entry), "Transaction keys must be unique" -def load_and_write_back(filename): - entries, errors, options_map = beancount.loader.load_file(filename) + new_entries = [] + for csv_entry in csv_entries: + key = (csv_entry.csv_file, csv_entry.index) + if key in key_to_entry: + existing_entry = key_to_entry[key] + if existing_entry.meta["source_row"] != csv_entry.row: + msg = f"Consistency error: CSV entry {csv_entry} is different to {existing_entry}" + logging.error(msg) + raise SystemExit(1) + else: + new_entries.append(csv_entry) + logging.info(f"Got {len(new_entries)} new and {len(entries)} existing transactions.") + return new_entries + +def update_ledger(config: Config): def beancount_entry_to_string(entry) -> str: buf = io.StringIO() beancount.parser.printer.print_entry(entry, file=buf) @@ -31,23 +50,44 @@ def load_and_write_back(filename): def is_transaction(entry) -> bool: return isinstance(entry, beancount.core.data.Transaction) - prev_entry_was_transaction = False + filename = config.output_file + entries, errors, options_map = beancount.loader.load_file(filename) + if errors: - print(f"errors in generated '{filename}'") + logging.error(f"errors in '{filename}'") for err in errors: - print(err) - else: - entries.sort(key=lambda e: e.date) - with open(filename, "w") as f: - f.write('option "operating_currency" "USD"\n') - for entry in entries: - if prev_entry_was_transaction: - f.write("\n") - elif not prev_entry_was_transaction and is_transaction(entry): - f.write("\n") - f.write(beancount_entry_to_string(entry)) + logging.error(err) + raise SystemExit(1) + + entries.sort(key=lambda e: e.date) + + # Note(felixm): Only write back transactions from the main beancount file. + # The issue is that `beancount.loader.load_file` does not allow for a full + # round trip; some of the entries get swallowed. Therefore, treat all files + # that are not the main beancount file as input only files. This means + # these input only files can only be edited by hand, but the user can use + # them to set options for beancount and fava, and add other types of + # entries that would otherwise disappear after the round trip. I have seen + # tickets on GitHub about changing this API so that everything can be + # written back as is, but until then, this works well for my use-case. + entries = [e for e in entries if e.meta["filename"] == str(filename.absolute())] + + csv_entries = process_csv_files(config, False) + entries += get_new_entries(entries, csv_entries) + remove_if_exists(config.output_file) + process_ldg_files(config) + + with open(filename, "a") as f: + prev_entry_was_transaction = False + for entry in entries: + if prev_entry_was_transaction: f.write("\n") - prev_entry_was_transaction = is_transaction(entry) + elif not prev_entry_was_transaction and is_transaction(entry): + f.write("\n") + f.write(beancount_entry_to_string(entry)) + f.write("\n") + prev_entry_was_transaction = is_transaction(entry) + logging.info(f"Ledger file '{filename}' was written successfully.") def main(): @@ -56,11 +96,7 @@ def main(): if len(sys.argv) > 2 and sys.argv[2] == "train": train(config) else: - remove_if_exists(config.output_file) - write_meta(config) - process_csv_files(config) - load_and_write_back(config.output_file) - process_ldg_files(config) + update_ledger(config) if __name__ == "__main__": diff --git a/src/toldg/fzf.py b/src/toldg/fzf.py index 6f36f50..554cfec 100644 --- a/src/toldg/fzf.py +++ b/src/toldg/fzf.py @@ -8,9 +8,7 @@ EXECUTABLE_NAME = "fzf.exe" if sys.platform == "win32" else "fzf" def iterfzf(iterable, prompt="> "): cmd = [EXECUTABLE_NAME, "--prompt=" + prompt] encoding = sys.getdefaultencoding() - proc = subprocess.Popen( - cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None - ) + proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None) if proc.stdin is None: return None try: @@ -24,7 +22,10 @@ def iterfzf(iterable, prompt="> "): return None if proc.stdout is None: return None - decode = lambda t: t.decode(encoding) + + def decode(t): + return t.decode(encoding) + output = [decode(ln.strip(b"\r\n\0")) for ln in iter(proc.stdout.readline, b"")] try: return output[0] diff --git a/src/toldg/models.py b/src/toldg/models.py index 377a712..c0646b1 100644 --- a/src/toldg/models.py +++ b/src/toldg/models.py @@ -86,4 +86,3 @@ class Transaction(BaseModel): def key(self): return self.csv_file + ", " + self.row - diff --git a/src/toldg/predict.py b/src/toldg/predict.py index a82f1ca..e1bbe34 100644 --- a/src/toldg/predict.py +++ b/src/toldg/predict.py @@ -1,11 +1,10 @@ -import json import logging import os import pickle import re -from collections import Counter, defaultdict +from collections import Counter from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np @@ -31,9 +30,7 @@ class Tokenizer: word_counts.update(tokens) # Filter words by minimum count - filtered_words = [ - word for word, count in word_counts.items() if count >= self.min_count - ] + filtered_words = [word for word, count in word_counts.items() if count >= self.min_count] # Build vocabulary self.vocab = {word: idx for idx, word in enumerate(filtered_words)} @@ -373,7 +370,6 @@ def get_sort_categories(model_path: Path): _classifier.sort_categories(row, categories) try: - model_path = Path("transaction_classifier.pkl") _classifier = TransactionClassifier(model_path) if _classifier.model is None: @@ -385,9 +381,7 @@ def get_sort_categories(model_path: Path): return sort_categories -def add_account2( - model_path: Path, transactions: List[Transaction], categories: List[str] -): +def add_account2(model_path: Path, transactions: List[Transaction], categories: List[str]): """Add account2 to unmapped transactions.""" unmapped_transactions = list(filter(lambda t: t.mapping is None, transactions)) if len(unmapped_transactions) == 0: diff --git a/src/toldg/process.py b/src/toldg/process.py index ea17bdf..0f39397 100644 --- a/src/toldg/process.py +++ b/src/toldg/process.py @@ -14,10 +14,9 @@ from toldg.models import Config, CsvConfig, Mapping, Transaction def process_ldg_files(config: Config): with open(config.output_file, "a") as f_out: - f_out.write("\n") for ldg_file in toldg.utils.get_ldg_files(config.input_directory): ldg_rel = os.path.relpath(ldg_file, os.path.dirname(config.output_file)) - f_out.write(f"include \"{ldg_rel}\"\n") + f_out.write(f'include "{ldg_rel}"\n') def get_csv_config(csv_file: str, csv_configs: list[CsvConfig]) -> CsvConfig: @@ -63,8 +62,7 @@ def get_transactions(csv_file: str, config: CsvConfig) -> list[Transaction]: for _ in range(config.skip): next(reader) rows = [row for row in reader if row] - transactions = [row_to_transaction(i, row, fields) - for i, row in enumerate(reversed(rows))] + transactions = [row_to_transaction(i, row, fields) for i, row in enumerate(reversed(rows))] return transactions @@ -74,9 +72,7 @@ def apply_mappings(transactions: list[Transaction], mappings: dict[str, Mapping] if t.key() in mappings: mapping = mappings[t.key()] assert isinstance(mapping, Mapping) - assert ( - mapping.count > 0 - ), f"{mapping} used by {t} but count is not greater than '0'." + assert mapping.count > 0, f"{mapping} used by {t} but count is not greater than '0'." mapping.count -= 1 t.mapping = mapping else: @@ -86,7 +82,7 @@ def apply_mappings(transactions: list[Transaction], mappings: dict[str, Mapping] assert mapping.count == 0, f"{mapping} was not used as often as expected!" -def process_csv_files(config: Config) -> list[Transaction]: +def process_csv_files(config: Config, write_outputs: bool = True) -> list[Transaction]: csv_files = toldg.utils.get_csv_files(config.input_directory) transactions = [] for csv_file in csv_files: @@ -97,6 +93,7 @@ def process_csv_files(config: Config) -> list[Transaction]: mappings = toldg.utils.read_mappings(config.mappings_file) apply_mappings(transactions, mappings) toldg.predict.add_account2(config.model, transactions, config.categories) - toldg.utils.write_mappings(transactions, config.mappings_file) - toldg.write.render_to_file(transactions, config) + if write_outputs: + toldg.utils.write_mappings(transactions, config.mappings_file) + toldg.write.render_to_file(transactions, config) return transactions diff --git a/src/toldg/train.py b/src/toldg/train.py index 6f28f28..761f6e5 100644 --- a/src/toldg/train.py +++ b/src/toldg/train.py @@ -1,5 +1,4 @@ import logging -from pathlib import Path from toldg.models import Config from toldg.predict import train_classifier diff --git a/src/toldg/utils.py b/src/toldg/utils.py index 105c454..f651e9b 100644 --- a/src/toldg/utils.py +++ b/src/toldg/utils.py @@ -3,7 +3,7 @@ import logging import os import sys from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Dict, List from pydantic import ValidationError @@ -57,14 +57,6 @@ def category_to_bean(c: str) -> str: return ":".join(new_sections) -def write_meta(config: Config): - with open(config.output_file, "a") as f: - for category in config.categories: - f.write(f"2017-01-01 open {category_to_bean(category)}\n") - f.write("\n") - f.write('option "operating_currency" "USD"\n\n') - - def write_mappings(transactions: List[Transaction], mappings_file: Path): """Write transactions to the mappings file.""" @@ -74,10 +66,8 @@ def write_mappings(transactions: List[Transaction], mappings_file: Path): pass else: mapping = Mapping( - **{ - "account2": t.account2.strip(), - "narration": t.description, - } + account2=t.account2.strip(), + narration=t.description, ) mappings[t.key()] = mapping diff --git a/src/toldg/write.py b/src/toldg/write.py index eeeb293..d0518ef 100644 --- a/src/toldg/write.py +++ b/src/toldg/write.py @@ -1,4 +1,3 @@ -from pathlib import Path from typing import List from toldg.models import Config, Transaction @@ -6,11 +5,11 @@ from toldg.utils import category_to_bean BEANCOUNT_TRANSACTION_TEMPLATE = """ {t.date} * {description}{tags} - {account2:<40} {t.debit:<6} {t.currency} - {account1:<40} {t.credit:<6} {t.currency} source_file: "{t.csv_file}" source_index: {t.index} source_row: "{t.row}" + {account2:<40} {t.debit:<6} {t.currency} + {account1:<40} {t.credit:<6} {t.currency} """ @@ -58,4 +57,3 @@ def render_to_file(transactions: List[Transaction], config: Config): content = "".join(format(t) for t in transactions) with open(config.output_file, "a") as f: f.write(content) -