From ba0c906e3c65b8e74357a6970e230450633ca9a2 Mon Sep 17 00:00:00 2001
From: felixm
Date: Sat, 24 Jun 2023 22:54:31 +0200
Subject: [PATCH] Refactor ledger processing to an explicit mapping, which
 will make automated classification easy

---
 Pipfile        |   2 +
 Pipfile.lock   | 104 +++++++++++++++++++++++++++++++++++++++++++++++++
 README.md      |  24 +++++++++++-
 main.py        |   6 ---
 src/models.py  |  60 ++++++++++++++++++++++++++++
 src/process.py | 102 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/utils.py   |  72 ++++++++++++++++++++++++++++++++++
 src/write.py   |  17 ++++++++
 toldg.py       |  42 ++++++++++++++++++++
 9 files changed, 421 insertions(+), 8 deletions(-)
 create mode 100644 Pipfile.lock
 delete mode 100644 main.py
 create mode 100644 src/models.py
 create mode 100644 src/process.py
 create mode 100644 src/utils.py
 create mode 100644 src/write.py
 create mode 100644 toldg.py

diff --git a/Pipfile b/Pipfile
index 9daa373..5d18211 100644
--- a/Pipfile
+++ b/Pipfile
@@ -4,6 +4,8 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
+rich = "*"
+pydantic = "*"
 
 [dev-packages]
 
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..b190f1a
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,104 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "654c54f63f5623a4ee5945b77e4aed25a286f4264d9ff82eb5196e5f23336dca"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_full_version": "3.11.3",
+            "python_version": "3.11"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "markdown-it-py": {
+            "hashes": [
+                "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1",
+                "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==3.0.0"
+        },
+        "mdurl": {
+            "hashes": [
+                "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8",
+                "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==0.1.2"
+        },
+        "pydantic": {
+            "hashes": [
+                "sha256:07293ab08e7b4d3c9d7de4949a0ea571f11e4557d19ea24dd3ae0c524c0c334d",
+                "sha256:0a2aabdc73c2a5960e87c3ffebca6ccde88665616d1fd6d3db3178ef427b267a",
+                "sha256:0da48717dc9495d3a8f215e0d012599db6b8092db02acac5e0d58a65248ec5bc",
+                "sha256:128d9453d92e6e81e881dd7e2484e08d8b164da5507f62d06ceecf84bf2e21d3",
+                "sha256:2196c06484da2b3fded1ab6dbe182bdabeb09f6318b7fdc412609ee2b564c49a",
+                "sha256:2e9aec8627a1a6823fc62fb96480abe3eb10168fd0d859ee3d3b395105ae19a7",
+                "sha256:3283b574b01e8dbc982080d8287c968489d25329a463b29a90d4157de4f2baaf",
+                "sha256:3c52eb595db83e189419bf337b59154bdcca642ee4b2a09e5d7797e41ace783f",
+                "sha256:4b466a23009ff5cdd7076eb56aca537c745ca491293cc38e72bf1e0e00de5b91",
+                "sha256:517a681919bf880ce1dac7e5bc0c3af1e58ba118fd774da2ffcd93c5f96eaece",
+                "sha256:5f8bbaf4013b9a50e8100333cc4e3fa2f81214033e05ac5aa44fa24a98670a29",
+                "sha256:6257bb45ad78abacda13f15bde5886efd6bf549dd71085e64b8dcf9919c38b60",
+                "sha256:67195274fd27780f15c4c372f4ba9a5c02dad6d50647b917b6a92bf00b3d301a",
+                "sha256:6cafde02f6699ce4ff643417d1a9223716ec25e228ddc3b436fe7e2d25a1f305",
+                "sha256:73ef93e5e1d3c8e83f1ff2e7fdd026d9e063c7e089394869a6e2985696693766",
+                "sha256:7845b31959468bc5b78d7b95ec52fe5be32b55d0d09983a877cca6aedc51068f",
+                "sha256:7847ca62e581e6088d9000f3c497267868ca2fa89432714e21a4fb33a04d52e8",
+                "sha256:7e1d5290044f620f80cf1c969c542a5468f3656de47b41aa78100c5baa2b8276",
+                "sha256:7ee829b86ce984261d99ff2fd6e88f2230068d96c2a582f29583ed602ef3fc2c",
"sha256:83fcff3c7df7adff880622a98022626f4f6dbce6639a88a15a3ce0f96466cb60", + "sha256:939328fd539b8d0edf244327398a667b6b140afd3bf7e347cf9813c736211896", + "sha256:95c70da2cd3b6ddf3b9645ecaa8d98f3d80c606624b6d245558d202cd23ea3be", + "sha256:963671eda0b6ba6926d8fc759e3e10335e1dc1b71ff2a43ed2efd6996634dafb", + "sha256:970b1bdc6243ef663ba5c7e36ac9ab1f2bfecb8ad297c9824b542d41a750b298", + "sha256:9863b9420d99dfa9c064042304868e8ba08e89081428a1c471858aa2af6f57c4", + "sha256:ad428e92ab68798d9326bb3e5515bc927444a3d71a93b4a2ca02a8a5d795c572", + "sha256:b48d3d634bca23b172f47f2335c617d3fcb4b3ba18481c96b7943a4c634f5c8d", + "sha256:b9cd67fb763248cbe38f0593cd8611bfe4b8ad82acb3bdf2b0898c23415a1f82", + "sha256:d111a21bbbfd85c17248130deac02bbd9b5e20b303338e0dbe0faa78330e37e0", + "sha256:e1aa5c2410769ca28aa9a7841b80d9d9a1c5f223928ca8bec7e7c9a34d26b1d4", + "sha256:e692dec4a40bfb40ca530e07805b1208c1de071a18d26af4a2a0d79015b352ca", + "sha256:e7c9900b43ac14110efa977be3da28931ffc74c27e96ee89fbcaaf0b0fe338e1", + "sha256:eec39224b2b2e861259d6f3c8b6290d4e0fbdce147adb797484a42278a1a486f", + "sha256:f0b7628fb8efe60fe66fd4adadd7ad2304014770cdc1f4934db41fe46cc8825f", + "sha256:f50e1764ce9353be67267e7fd0da08349397c7db17a562ad036aa7c8f4adfdb6", + "sha256:fab81a92f42d6d525dd47ced310b0c3e10c416bbfae5d59523e63ea22f82b31e" + ], + "index": "pypi", + "version": "==1.10.9" + }, + "pygments": { + "hashes": [ + "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c", + "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1" + ], + "markers": "python_version >= '3.7'", + "version": "==2.15.1" + }, + "rich": { + "hashes": [ + "sha256:8f87bc7ee54675732fa66a05ebfe489e27264caeeff3728c945d25971b6485ec", + "sha256:d653d6bccede5844304c605d5aac802c7cf9621efd700b46c7ec2b51ea914898" + ], + "index": "pypi", + "version": "==13.4.2" + }, + "typing-extensions": { + "hashes": [ + "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26", + "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5" + ], + "markers": "python_version >= '3.7'", + "version": "==4.6.3" + } + }, + "develop": {} +} diff --git a/README.md b/README.md index 811b5a0..f85fb93 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ -# defaultpy +# ledgerai -Default Python project. +Script to transform CSV data into [ledger](https://ledger-cli.org/) accounting +files. + +# Usage Run `pipenv install -dev` to install all packages. @@ -8,3 +11,20 @@ Run `pipenv shell` to get venv shell. Run `pipenv install ` to install a package. +# Architecture + +The script takes a directory in which it recursively searches for CSV and LDG +files. From these files, it generates a single ledger accounting file that +includes all transactions. + +For now, ledger files are simply appended to the output file without +modifications. + +However, the transaction for the CSV files are extended with their *account2* +information, i.e, the category of the transaction. Optionally, these +transactions can also get a more meaningful description and tags. + +The mapping information are stored in a file `mappings.json`. It maps a unique +identifier for each transaction (based on filename, line number) to the +respective *account2*, and (optinally) *tags* or *description. 
+
diff --git a/main.py b/main.py
deleted file mode 100644
index fd55dc1..0000000
--- a/main.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from src import hello
-
-
-if __name__ == "__main__":
-    hello()
-
diff --git a/src/models.py b/src/models.py
new file mode 100644
index 0000000..8f837b1
--- /dev/null
+++ b/src/models.py
@@ -0,0 +1,60 @@
+from pydantic import BaseModel, Extra
+from typing import List
+from pathlib import Path
+from typing import List
+
+
+class CsvConfig(BaseModel):
+    """
+    Class to define how to parse a certain CSV file. We use the
+    file_match_regex attribute to decide whether to apply a config to a file.
+    If multiple configs match a single file, we abort with an error.
+    """
+    class Config:
+        extra = Extra.forbid
+
+    account1: str
+    file_match_regex: str
+    fields: List[str]
+    input_date_format: str = "%m/%d/%Y"
+    output_date_format: str = "%Y/%m/%d"
+    skip: int = 1
+    delimiter: str = ","
+    quotechar: str = "\""
+    currency: str = "$"
+
+
+class Config(BaseModel):
+    """
+    Basic class for the configuration of this script.
+    - input_directory: we search for ldg and csv files recursively here
+    - output_file: single ledger file into which all transactions from the
+      input files are written
+    - mappings_file: JSON file that maps transaction rows to their account2
+    - csv_configs: configuration for the different input files
+    """
+    class Config:
+        extra = Extra.forbid
+
+    input_directory: Path
+    mappings_file: Path
+    output_file: Path = Path("output.ldg")
+    csv_configs: List[CsvConfig]
+
+
+class Transaction(BaseModel):
+    """
+    Class for a ledger transaction that is rendered into the ldg file.
+    """
+    class Config:
+        extra = Extra.forbid
+
+    currency: str
+    debit: str
+    credit: str
+    date: str
+    account1: str
+    account2: str
+    description: str
+    csv_file: str
+    row: str
diff --git a/src/process.py b/src/process.py
new file mode 100644
index 0000000..1377f9c
--- /dev/null
+++ b/src/process.py
@@ -0,0 +1,102 @@
+import csv
+import logging
+import re
+import sys
+import datetime
+import src.utils
+import src.write
+from src.models import Config, CsvConfig, Transaction
+from typing import List, Dict
+
+
+def process_ldg_files(config: Config):
+    for ldg_file in src.utils.get_ldg_files(config.input_directory):
+        with open(ldg_file, 'r') as f_in:
+            with open(config.output_file, 'a') as f_out:
+                f_out.write(f_in.read())
+
+
+def get_csv_config(csv_file: str, csv_configs: List[CsvConfig]) -> CsvConfig:
+    cs = [c for c in csv_configs
+          if re.match(c.file_match_regex, csv_file)]
+    if not cs:
+        logging.critical(f"No CSV config for {csv_file}.")
+        sys.exit(1)
+    elif len(cs) > 1:
+        logging.critical(f"Multiple CSV configs for {csv_file}.")
+        sys.exit(1)
+    return cs[0]
+
+
+def get_transactions(csv_file: str, config: CsvConfig) -> List[Transaction]:
+    def date_to_date(date: str) -> str:
+        d = datetime.datetime.strptime(date, config.input_date_format)
+        return d.strftime(config.output_date_format)
+
+    def flip_sign(amount: str) -> str:
+        return amount[1:] if amount.startswith("-") else "-" + amount
+
+    def row_to_transaction(row, fields):
+        """ The user can configure the mapping of CSV fields to the three
+        required fields date, amount, and description via the CsvConfig.
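+
+        For example (hypothetical config), fields=["date", "", "amount",
+        "description"] assigns CSV column 0 to date, skips column 1, and
+        assigns columns 2 and 3 to amount and description.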
""" + t = {field: row[index] for index, field in fields} + amount = t['amount'] + return Transaction( + currency=config.currency, + debit=flip_sign(amount), + credit=amount, + date=date_to_date(t['date']), + account1=config.account1, + account2="account2", + description=t['description'], + csv_file=csv_file, + row=csv_file + ", " + ", ".join(row)) + + fields = [(i, f) for i, f in enumerate(config.fields) if f] + with open(csv_file, 'r') as f: + reader = csv.reader(f, delimiter=config.delimiter, + quotechar=config.quotechar) + for _ in range(config.skip): + next(reader) + transactions = [row_to_transaction(row, fields) + for row in reader if row] + return transactions + + +def find_duplicates(transactions: List[Transaction]): + rows = set() + for t in transactions: + row = t.row + if row in rows: + logging.critical(f"'{row}' is duplicated.") + logging.critical("Exit because of duplicated transactions.") + sys.exit(1) + else: + rows.add(row) + + +def apply_mappings(transactions: List[Transaction], mappings: Dict[str, str]): + unused_mappings = set(mappings.keys()) + for t in transactions: + if t.row in mappings: + t.account2 = mappings[t.row] + unused_mappings.discard(t.row) + else: + logging.warning(f"No mapping for '{t}'.") + for row in unused_mappings: + logging.warning(f"Unused mapping '{row}' -> {mappings[row]}.") + + +def process_csv_files(config: Config): + csv_files = src.utils.get_csv_files(config.input_directory) + transactions = [] + for csv_file in csv_files: + csv_file = str(csv_file) + csv_config = get_csv_config(csv_file, config.csv_configs) + transactions += get_transactions(csv_file, csv_config) + find_duplicates(transactions) + mappings = src.utils.read_mappings(config.mappings_file) + apply_mappings(transactions, mappings) + src.utils.write_mappings(transactions, config.mappings_file) + src.write.render_to_file(transactions, config.output_file) + diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..171294c --- /dev/null +++ b/src/utils.py @@ -0,0 +1,72 @@ +import logging +import os +import sys +import logging +import json +from pathlib import Path +from typing import List, Dict +from src.models import Config, Transaction +from pydantic import ValidationError + + +def get_files(directory: Path, ending="") -> List[Path]: + """ Gets files from directory recursively in lexigraphic order. 
""" + return [Path(os.path.join(subdir, f)) + for subdir, _, files in os.walk(directory) + for f in files + if f.endswith(ending)] + + +def get_csv_files(directory: Path) -> List[Path]: + return get_files(directory, ".csv") + + +def get_ldg_files(directory: Path) -> List[Path]: + return get_files(directory, ".ldg") + + +def load_config() -> Config: + try: + config_file = Path(sys.argv[1]) + except IndexError: + logging.critical("Provide configuration file as first argument.") + sys.exit(1) + + try: + with open(config_file, 'r') as f: + config = Config(**json.load(f)) + except ValidationError as e: + logging.critical(f"Could not validate {config_file}.") + logging.info(e) + sys.exit(1) + except FileNotFoundError: + logging.critical(f"Could not find {config_file}.") + sys.exit(1) + return config + + +def write_mappings(transactions: List[Transaction], mappings_file: Path): + mappings = {} + for t in transactions: + try: + mappings[t.account2.strip()].append(t.row) + except KeyError: + mappings[t.account2.strip()] = [t.row] + + with open(mappings_file, "w") as f: + json.dump({k: sorted(v) for k, v in sorted(mappings.items())}, f, indent=4) + + +def read_mappings(mappings_file: Path) -> Dict[str, str]: + with open(mappings_file, 'r') as f: + account2_to_rows = json.load(f) + return {row: category + for category, rows in account2_to_rows.items() + for row in rows} + + +def remove_if_exists(output_file: Path): + try: + os.remove(output_file) + except OSError: + pass diff --git a/src/write.py b/src/write.py new file mode 100644 index 0000000..046cb9c --- /dev/null +++ b/src/write.py @@ -0,0 +1,17 @@ +from pathlib import Path +from typing import List +from src.models import Transaction + + +LEDGER_TRANSACTION_TEMPLATE = """ +{t.date} {t.description} ; {t.row} + {t.account2} {t.currency} {t.debit} + {t.account1} {t.currency} {t.credit} +""" + + +def render_to_file(transactions: List[Transaction], ledger_file: Path): + content = "".join([LEDGER_TRANSACTION_TEMPLATE.format(t=t) + for t in transactions]) + with open(ledger_file, 'a') as f: + f.write(content) diff --git a/toldg.py b/toldg.py new file mode 100644 index 0000000..d13f484 --- /dev/null +++ b/toldg.py @@ -0,0 +1,42 @@ +import os.path +import csv +import logging +import src.utils +import src.process +from src.models import Transaction +from rich.logging import RichHandler +from typing import List + + +def write_mappings(unmatched_transactions: List[Transaction], mappings_directory: str): + """ Write mappings for unmatched expenses for update by the user. """ + if not unmatched_transactions: + return + fn = os.path.join(mappings_directory, "unmatched.csv") + with open(fn, 'a') as f: + writer = csv.writer(f) + for t in unmatched_transactions: + e = ["expenses", t.description, + f"credit={t.credit};date={t.date}"] + writer.writerow(e) + + +def init_logging(): + logging.basicConfig( + level=logging.INFO, + format="%(message)s", + datefmt="[%X]", + handlers=[RichHandler()], + ) + + +def main(): + init_logging() + config = src.utils.load_config() + src.utils.remove_if_exists(config.output_file) + src.process.process_ldg_files(config) + src.process.process_csv_files(config) + + +if __name__ == "__main__": + main()