Update project structure and move to beancount

2025-03-02 11:08:33 -05:00
parent 886bcdbdd1
commit 08c50e776e
17 changed files with 1844 additions and 296 deletions
@@ -0,0 +1 @@
+# Version string exposed as the module-level ``__version__`` attribute.
+__version__ = "0.1.0"
@@ -0,0 +1,28 @@
+import logging
+
+from rich.logging import RichHandler
+
+from toldg.process import process_csv_files, process_ldg_files
+from toldg.utils import load_config, remove_if_exists, write_meta
+
+
+def init_logging():
+    """Configure root logging at INFO level with a single RichHandler."""
+    rich_handler = RichHandler()
+    settings = {
+        "level": logging.INFO,
+        "format": "%(message)s",
+        "datefmt": "[%X]",
+        "handlers": [rich_handler],
+    }
+    logging.basicConfig(**settings)
+
+
+def main():
+    """Run the whole conversion for the config file named on the command line.
+
+    The order of the steps matters: the previous output file is removed first
+    because every later step opens the output file in append mode.
+    """
+    init_logging()
+    config = load_config()
+    # Start from a clean file; the steps below only ever append to it.
+    remove_if_exists(config.output_file)
+    write_meta(config)
+    process_ldg_files(config)
+    process_csv_files(config)
+
+
+# Only run the conversion when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,32 @@
+import errno
+import subprocess
+import sys
+
+# Name of the fzf binary to launch; on Windows the ".exe" suffix is used.
+EXECUTABLE_NAME = "fzf.exe" if sys.platform == "win32" else "fzf"
+
+
+def iterfzf(iterable, prompt="> "):
+    """Let the user pick one entry from ``iterable`` by running fzf.
+
+    Args:
+        iterable: Strings offered as choices; they are sent to fzf one per line.
+        prompt: Prompt text handed to fzf through ``--prompt``.
+
+    Returns:
+        The first line fzf wrote to stdout, or ``None`` when fzf exits with a
+        status other than 0 or 1 or writes nothing.
+
+    Raises:
+        OSError: If feeding the choices to fzf fails for a reason other than
+            fzf having already closed its end of the pipe.
+    """
+    cmd = [EXECUTABLE_NAME, "--prompt=" + prompt]
+    encoding = "utf-8"
+    proc = subprocess.Popen(
+        cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None
+    )
+    if proc.stdin is None:
+        return None
+    try:
+        lines = "\n".join(iterable)
+        proc.stdin.write(lines.encode(encoding))
+        proc.stdin.close()
+    except IOError as e:
+        # fzf may quit before it has consumed all choices. That shows up as
+        # EPIPE (or EINVAL on Windows) and is harmless; anything else is a
+        # genuine failure and must not be swallowed.
+        if e.errno not in (errno.EPIPE, errno.EINVAL):
+            raise
+    if proc.stdout is None:
+        return None
+    # Drain stdout before waiting so a full pipe can never block fzf, and
+    # close the pipe once it has been read.
+    with proc.stdout:
+        raw_lines = proc.stdout.readlines()
+    if proc.wait() not in [0, 1]:
+        return None
+    if not raw_lines:
+        return None
+    return raw_lines[0].strip(b"\r\n\0").decode(encoding)
@@ -0,0 +1,76 @@
+from pathlib import Path
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+# Placeholder account2 given to every freshly parsed transaction. Transactions
+# that still carry it after the mappings were applied are categorized
+# interactively.
+UNKNOWN_CATEGORY = "account2"
+
+
+class CsvConfig(BaseModel):
+    """
+    Class to define how to parse a certain CSV file. We use the
+    file_match_regex attribute to decide whether to apply a config for a file.
+    Exactly one config has to match each file: if none or more than one
+    matches, the program logs a critical error and exits.
+    """
+
+    class Config:
+        # Reject keys that are not declared below instead of ignoring them.
+        extra = "forbid"
+
+    # Account the CSV file belongs to; becomes account1 of every transaction.
+    account1: str
+    # Regular expression matched (anchored at the start) against the file path.
+    file_match_regex: str
+    # Column names by position; empty names are ignored. Parsing reads the
+    # columns named "date", "amount" and "description".
+    fields: List[str]
+    # strptime format of the date column in the CSV file.
+    input_date_format: str = "%m/%d/%Y"
+    # strftime format the parsed date is converted to.
+    output_date_format: str = "%Y/%m/%d"
+    # Number of leading rows (e.g. a header) to skip.
+    skip: int = 1
+    # Delimiter and quote character handed to csv.reader.
+    delimiter: str = ","
+    quotechar: str = '"'
+    # Currency attached to every transaction read from the file.
+    currency: str = "USD"
+
+
+class Config(BaseModel):
+    """
+    Configuration class for managing file search and data processing settings.
+
+    Attributes:
+        input_directory (Path):   Where to search for 'ldg' and 'csv' files.
+        mappings_file (Path):     The path to a 'json' file that maps each account2
+                                  to the list of CSV rows assigned to it.
+        descriptions_file (Path): Optional file with replacement descriptions for
+                                  individual CSV rows. Skipped when not set.
+        output_file (Path):       Location to which to write the output file.
+        csv_configs:              List of CsvConfig which explains how to handle
+                                  specific CSV files.
+        categories (List[str]):   A list of account2s. An account has to be defined
+                                  here before it can be used in a mapping.
+                                  Otherwise, ledger will complain.
+        commodities (List[str]):  A list of commodities relevant to the data
+                                  processing.
+        find_duplicates (bool):   Flag to check and abort on duplicated
+                                  transactions. Not really useful.
+    """
+
+    class Config:
+        # Reject keys that are not declared below instead of ignoring them.
+        extra = "forbid"
+
+    input_directory: Path
+    mappings_file: Path
+    descriptions_file: Optional[Path] = None
+    output_file: Path = Path("output.ldg")
+    csv_configs: List[CsvConfig]
+    categories: List[str]
+    commodities: List[str]
+    find_duplicates: bool = False
+
+
+class Transaction(BaseModel):
+    """
+    Class for a single transaction parsed from a CSV row, to be rendered into
+    the output file. All values are kept as strings.
+    """
+
+    class Config:
+        # Reject keys that are not declared below instead of ignoring them.
+        extra = "forbid"
+
+    currency: str
+    # Amount posted to account2: the CSV amount with its sign flipped.
+    debit: str
+    # Amount posted to account1: the CSV amount as read.
+    credit: str
+    date: str
+    # Account of the CSV file the transaction came from.
+    account1: str
+    # Counter account (category); starts out as the unknown placeholder.
+    account2: str
+    description: str
+    # Path of the CSV file the row was read from.
+    csv_file: str
+    # CSV file path plus the joined row; used as the key for mappings and
+    # descriptions.
+    row: str
@@ -0,0 +1,49 @@
+from typing import List
+
+from toldg.fzf import iterfzf
+from toldg.models import UNKNOWN_CATEGORY, Transaction
+
+
+def get_sort_categories():
+    """Build a callback that reorders categories by predicted probability.
+
+    Tries to load a fastai learner from "export.pkl". When the fastai module
+    is missing the user is asked whether to go on without it; in that case
+    the returned callback leaves the category list untouched.
+
+    Returns:
+        A function ``sort_categories(row, categories)`` that sorts
+        ``categories`` in place, most probable first.
+
+    Raises:
+        Exception: If fastai is missing and the user does not type "yes".
+    """
+    try:
+        from fastai.text.all import load_learner
+
+        # NOTE(review): presumably unpickles the model file - only use a
+        # trusted "export.pkl".
+        learn = load_learner("export.pkl")
+    except ModuleNotFoundError:
+        learn = None
+        answer = input("No fastai module. Type yes to continue anyway.")
+        if answer.strip().lower() != "yes":
+            raise Exception("fastai module missing")
+
+    def sort_categories(row: str, categories: List[str]):
+        # Without a model there is nothing to rank by.
+        if learn is None:
+            return
+        prediction = learn.predict(row)
+        probs = prediction[2]
+        scores = dict(zip(learn.dls.vocab[1], probs.tolist()))
+        # Categories the model does not know sort last with probability 0.
+        categories.sort(key=lambda name: scores.get(name, 0.0), reverse=True)
+
+    return sort_categories
+
+
+def add_account2(transactions: List[Transaction], categories: List[str]):
+    """Interactively assign an account2 to every still-unmapped transaction.
+
+    Does nothing (and does not load the prediction model) when no transaction
+    carries the unknown placeholder category.
+    """
+    pending = [t for t in transactions if t.account2 == UNKNOWN_CATEGORY]
+    if not pending:
+        return
+    sort_categories = get_sort_categories()
+    for transaction in pending:
+        # Put the most likely categories first before asking the user.
+        sort_categories(transaction.row, categories)
+        add_account2_interactive(transaction, categories)
+
+
+def add_account2_interactive(transaction: Transaction, categories: List[str]):
+    """Ask the user, via fzf, for the account2 of ``transaction`` and store it.
+
+    The picker is shown again until it returns a selection.
+    """
+    t = transaction
+    prompt = f"{t.account1} {t.date} {t.description} {t.debit} > "
+    while True:
+        account2 = iterfzf(categories, prompt=prompt)
+        if account2 is not None:
+            break
+    transaction.account2 = account2
+    print(f"Assigned category '{account2}'.")
@@ -0,0 +1,121 @@
+import csv
+import datetime
+import logging
+import re
+import sys
+from typing import Dict, List
+
+import toldg.models
+import toldg.predict
+import toldg.utils
+import toldg.write
+from toldg.models import Config, CsvConfig, Transaction
+
+
+def process_ldg_files(config: Config):
+    """Append the content of every ldg file in the input directory to the output file."""
+    for source_path in toldg.utils.get_ldg_files(config.input_directory):
+        with open(source_path, "r") as f_in, open(config.output_file, "a") as f_out:
+            content = f_in.read()
+            f_out.write(content)
+
+
+def get_csv_config(csv_file: str, csv_configs: List[CsvConfig]) -> CsvConfig:
+    """Return the one CsvConfig whose file_match_regex matches ``csv_file``.
+
+    Logs a critical message and exits with status 1 when no config or more
+    than one config matches.
+    """
+    matching = []
+    for candidate in csv_configs:
+        if re.match(candidate.file_match_regex, csv_file):
+            matching.append(candidate)
+
+    if len(matching) == 1:
+        return matching[0]
+
+    if matching:
+        logging.critical(f"Multiple CSV configs for {csv_file}.")
+    else:
+        logging.critical(f"No CSV config for {csv_file}.")
+    sys.exit(1)
+
+
+def get_transactions(csv_file: str, config: CsvConfig) -> List[Transaction]:
+    """Parse ``csv_file`` into transactions as described by ``config``.
+
+    Args:
+        csv_file: Path of the CSV file to read.
+        config: Settings for this file (column names, date formats, number of
+            rows to skip, delimiter, quote character, currency, account1).
+
+    Returns:
+        One Transaction per non-empty row after the skipped rows. Every
+        transaction starts with the unknown placeholder as account2.
+    """
+
+    def date_to_date(date: str) -> str:
+        d = datetime.datetime.strptime(date, config.input_date_format)
+        return d.strftime(config.output_date_format)
+
+    def flip_sign(amount: str) -> str:
+        return amount[1:] if amount.startswith("-") else "-" + amount
+
+    def row_to_transaction(row, fields):
+        """The user can configure the mapping of CSV fields to the three
+        required fields date, amount and description via the CsvConfig."""
+        t = {field: row[index] for index, field in fields}
+        amount = t["amount"]
+        return Transaction(
+            currency=config.currency,
+            debit=flip_sign(amount),
+            credit=amount,
+            date=date_to_date(t["date"]),
+            account1=config.account1,
+            account2=toldg.models.UNKNOWN_CATEGORY,
+            description=t["description"],
+            csv_file=csv_file,
+            # The file path is part of the key so equal rows from different
+            # files stay distinguishable.
+            row=csv_file + ", " + ", ".join(row),
+        )
+
+    # Keep only the named columns together with their position in the row.
+    fields = [(i, f) for i, f in enumerate(config.fields) if f]
+    with open(csv_file, "r") as f:
+        reader = csv.reader(f, delimiter=config.delimiter, quotechar=config.quotechar)
+        for _ in range(config.skip):
+            # A file with fewer rows than ``skip`` (e.g. an empty file) simply
+            # yields no transactions instead of raising StopIteration.
+            next(reader, None)
+        transactions = [row_to_transaction(row, fields) for row in reader if row]
+    return transactions
+
+
+def find_duplicates(transactions: List[Transaction]):
+    """Exit with status 1 as soon as two transactions share the same row."""
+    seen = set()
+    for row in (t.row for t in transactions):
+        if row not in seen:
+            seen.add(row)
+            continue
+        logging.critical(f"'{row}' is duplicated.")
+        logging.critical("Exit because of duplicated transactions.")
+        sys.exit(1)
+
+
+def apply_mappings(transactions: List[Transaction], mappings: Dict[str, str]):
+    """Set account2 of each transaction from ``mappings`` (row -> account2).
+
+    Warns about transactions without a mapping and about mappings that no
+    transaction used.
+    """
+    unused_mappings = set(mappings)
+    for t in transactions:
+        if t.row not in mappings:
+            logging.warning(f"No mapping for '{t}'.")
+            continue
+        t.account2 = mappings[t.row]
+        unused_mappings.discard(t.row)
+
+    for row in unused_mappings:
+        logging.warning(f"Unused mapping '{row}' -> {mappings[row]}.")
+
+
+def apply_descriptions(transactions: List[Transaction], descriptions: Dict[str, str]):
+    """Replace transaction descriptions with the ones configured per row.
+
+    Args:
+        transactions: Transactions to update in place.
+        descriptions: Maps a transaction's ``row`` to its new description.
+
+    Transactions without an entry keep their description. Entries that match
+    no transaction are reported with a warning.
+    """
+    unused_descriptions = set(descriptions)
+    for t in transactions:
+        if t.row in descriptions:
+            t.description = descriptions[t.row]
+            unused_descriptions.discard(t.row)
+    for row in unused_descriptions:
+        # Say "description" here so the warning cannot be mistaken for the
+        # unused-mapping warning of the mappings file.
+        logging.warning(f"Unused description '{row}' -> {descriptions[row]}.")
+
+
+def process_csv_files(config: Config):
+    """Turn all CSV files of the input directory into rendered transactions.
+
+    Steps: parse every CSV file, optionally check for duplicates, apply the
+    optional descriptions, apply the stored mappings, ask the user about
+    unmapped transactions, store the mappings and append the transactions to
+    the output file.
+    """
+    transactions = []
+    for path in toldg.utils.get_csv_files(config.input_directory):
+        csv_file = str(path)
+        csv_config = get_csv_config(csv_file, config.csv_configs)
+        transactions.extend(get_transactions(csv_file, csv_config))
+
+    if config.find_duplicates:
+        find_duplicates(transactions)
+
+    descriptions_file = config.descriptions_file
+    if descriptions_file is not None:
+        apply_descriptions(
+            transactions, toldg.utils.read_descriptions(descriptions_file)
+        )
+
+    apply_mappings(transactions, toldg.utils.read_mappings(config.mappings_file))
+
+    # Categorize whatever the mappings did not cover, then persist the result
+    # before rendering.
+    toldg.predict.add_account2(transactions, config.categories)
+    toldg.utils.write_mappings(transactions, config.mappings_file)
+    toldg.write.render_to_file(transactions, config)
@@ -0,0 +1,113 @@
+import json
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Dict, List
+
+from pydantic import ValidationError
+
+from toldg.models import Config, Transaction
+
+
+def get_files(directory: Path, ending="") -> List[Path]:
+    """Gets files from directory recursively in lexicographic order.
+
+    Args:
+        directory: Root directory to walk.
+        ending: Only files whose name ends with this string are returned;
+            the default empty string matches every file.
+
+    Returns:
+        The matching file paths, sorted.
+    """
+    # os.walk makes no promise about the order of its results, so sort
+    # explicitly to get the same order on every run and platform.
+    return sorted(
+        Path(os.path.join(subdir, f))
+        for subdir, _, files in os.walk(directory)
+        for f in files
+        if f.endswith(ending)
+    )
+
+
+def get_csv_files(directory: Path) -> List[Path]:
+    """Return all files below ``directory`` whose name ends with ".csv"."""
+    csv_files = get_files(directory, ending=".csv")
+    return csv_files
+
+
+def get_ldg_files(directory: Path) -> List[Path]:
+    """Return all files below ``directory`` whose name ends with ".ldg"."""
+    ldg_files = get_files(directory, ending=".ldg")
+    return ldg_files
+
+
+def load_config() -> Config:
+    """Load and validate the configuration file named by the first CLI argument.
+
+    Returns:
+        The validated Config.
+
+    Logs a critical message and exits with status 1 when the argument is
+    missing, the file does not exist, the file is not valid JSON, or its
+    content does not validate against Config.
+    """
+    try:
+        config_file = Path(sys.argv[1])
+    except IndexError:
+        logging.critical("Provide configuration file as first argument.")
+        sys.exit(1)
+
+    try:
+        with open(config_file, "r") as f:
+            config = Config(**json.load(f))
+    except json.JSONDecodeError as e:
+        # Report broken JSON the same way as the other configuration errors
+        # instead of letting the traceback escape.
+        logging.critical(f"Could not parse {config_file} as JSON.")
+        logging.info(e)
+        sys.exit(1)
+    except ValidationError as e:
+        logging.critical(f"Could not validate {config_file}.")
+        logging.info(e)
+        sys.exit(1)
+    except FileNotFoundError:
+        logging.critical(f"Could not find {config_file}.")
+        sys.exit(1)
+    return config
+
+
+def category_to_bean(c: str) -> str:
+    """Upper-case the first character of every ":"-separated section of ``c``.
+
+    The rest of each section is left as it is. Empty sections (an empty
+    string, a doubled or trailing colon) are kept empty instead of raising
+    an IndexError.
+
+    Args:
+        c: Account name such as "expenses:food".
+
+    Returns:
+        The converted name, e.g. "Expenses:Food".
+    """
+    # Slicing (rather than indexing) keeps empty sections safe.
+    return ":".join(
+        section[:1].upper() + section[1:] for section in c.split(":")
+    )
+
+
+def write_meta(config: Config):
+    """Append the account "open" directives and the currency option to the output file.
+
+    Every configured category is opened as of 2017-01-01. The configured
+    commodities are intentionally not written: a commodity section is not
+    required for beancount.
+    """
+    with open(config.output_file, "a") as f:
+        f.writelines(
+            f"2017-01-01 open {category_to_bean(category)}\n"
+            for category in config.categories
+        )
+        f.write("\n")
+        f.write('option "operating_currency" "USD"\n\n')
+
+
+def write_mappings(transactions: List[Transaction], mappings_file: Path):
+    """Store which rows belong to which account2 as JSON in ``mappings_file``.
+
+    The file maps each (whitespace-stripped) account2 to the sorted list of
+    its rows; the accounts themselves are written in sorted order.
+    """
+    grouped = {}
+    for t in transactions:
+        grouped.setdefault(t.account2.strip(), []).append(t.row)
+
+    ordered = {account2: sorted(rows) for account2, rows in sorted(grouped.items())}
+    with open(mappings_file, "w") as f:
+        json.dump(ordered, f, indent=4)
+
+
+def read_mappings(mappings_file: Path) -> Dict[str, str]:
+    """Read the JSON mappings file and invert it into a row -> account2 dict."""
+    with open(mappings_file, "r") as f:
+        account2_to_rows = json.load(f)
+
+    row_to_account2 = {}
+    for category, rows in account2_to_rows.items():
+        for row in rows:
+            row_to_account2[row] = category
+    return row_to_account2
+
+
+def read_descriptions(descriptions_file: Path) -> Dict[str, str]:
+    """Read the descriptions file into a row -> description dict.
+
+    The file consists of line pairs: the first line of a pair is the CSV row
+    and the second line is its description. A trailing line without a
+    partner is ignored.
+    """
+    with open(descriptions_file, "r") as f:
+        lines = [line.rstrip("\n") for line in f]
+    # Even positions hold the rows, odd positions the descriptions; zip drops
+    # an unpaired last row.
+    return dict(zip(lines[0::2], lines[1::2]))
+
+
+def remove_if_exists(output_file: Path):
+    """Delete ``output_file``; a file that does not exist is not an error.
+
+    Raises:
+        OSError: If the path exists but cannot be removed (for example
+            because of missing permissions or because it is a directory).
+            Silently carrying on would let later steps append to stale
+            output.
+    """
+    try:
+        os.remove(output_file)
+    except FileNotFoundError:
+        # Nothing to clean up.
+        pass
@@ -0,0 +1,32 @@
+from pathlib import Path
+from typing import List
+
+from toldg.models import Config, Transaction
+from toldg.utils import category_to_bean
+
+# Text of one rendered transaction. ``t`` is the transaction to render: the
+# account2 posting carries the debit amount, the account1 posting the credit
+# amount; account names are left-aligned to 40 and amounts to 6 characters.
+BEANCOUNT_TRANSACTION_TEMPLATE = """
+{t.date} * "{t.description}"
+    {t.account2:<40}  {t.debit:<6} {t.currency}
+    {t.account1:<40}  {t.credit:<6} {t.currency}
+"""
+
+
+def format(t):
+    """Normalize transaction ``t`` in place and return its rendered text.
+
+    Dates use "-" instead of "/", double quotes in the description are
+    escaped, amounts without a minus sign get a leading space, both account
+    names are converted with category_to_bean, and for EUR the "." and ","
+    characters of the amounts are swapped.
+    """
+
+    def swap_separators(amount):
+        # "|" serves as a temporary stand-in while "." and "," trade places.
+        return amount.replace(".", "|").replace(",", ".").replace("|", ",")
+
+    amount_fields = ("debit", "credit")
+
+    t.date = t.date.replace("/", "-")
+    t.description = t.description.replace('"', '\\"')
+    for name in amount_fields:
+        amount = getattr(t, name)
+        if not amount.startswith("-"):
+            setattr(t, name, " " + amount)
+    t.account1 = category_to_bean(t.account1)
+    t.account2 = category_to_bean(t.account2)
+    if t.currency == "EUR":
+        for name in amount_fields:
+            setattr(t, name, swap_separators(getattr(t, name)))
+    return BEANCOUNT_TRANSACTION_TEMPLATE.format(t=t)
+
+
+def render_to_file(transactions: List[Transaction], config: Config):
+    """Render all transactions and append the result to the output file."""
+    rendered = [format(t) for t in transactions]
+    with open(config.output_file, "a") as f:
+        f.write("".join(rendered))