From ba0c906e3c65b8e74357a6970e230450633ca9a2 Mon Sep 17 00:00:00 2001
From: felixm
Date: Sat, 24 Jun 2023 22:54:31 +0200
Subject: [PATCH] Refactor ledger processing to an explicit mapping, which
 will make automated classification easy

---
 Pipfile        |   2 +
 Pipfile.lock   | 104 +++++++++++++++++++++++++++++++++++++++++++++++++
 README.md      |  24 +++++++++++-
 main.py        |   6 ---
 src/models.py  |  60 ++++++++++++++++++++++++++++
 src/process.py | 102 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/utils.py   |  72 ++++++++++++++++++++++++++++++++++
 src/write.py   |  17 ++++++++
 toldg.py       |  42 ++++++++++++++++++++
 9 files changed, 421 insertions(+), 8 deletions(-)
 create mode 100644 Pipfile.lock
 delete mode 100644 main.py
 create mode 100644 src/models.py
 create mode 100644 src/process.py
 create mode 100644 src/utils.py
 create mode 100644 src/write.py
 create mode 100644 toldg.py

diff --git a/Pipfile b/Pipfile
index 9daa373..5d18211 100644
--- a/Pipfile
+++ b/Pipfile
@@ -4,6 +4,8 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
+rich = "*"
+pydantic = "*"
 
 [dev-packages]
 
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..b190f1a
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,104 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "654c54f63f5623a4ee5945b77e4aed25a286f4264d9ff82eb5196e5f23336dca"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_full_version": "3.11.3",
+            "python_version": "3.11"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "markdown-it-py": {
+            "hashes": [
+                "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1",
+                "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==3.0.0"
+        },
+        "mdurl": {
+            "hashes": [
+                "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8",
+                "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==0.1.2"
+        },
+        "pydantic": {
+            "hashes": [
+                "sha256:07293ab08e7b4d3c9d7de4949a0ea571f11e4557d19ea24dd3ae0c524c0c334d",
+                "sha256:0a2aabdc73c2a5960e87c3ffebca6ccde88665616d1fd6d3db3178ef427b267a",
+                "sha256:0da48717dc9495d3a8f215e0d012599db6b8092db02acac5e0d58a65248ec5bc",
+                "sha256:128d9453d92e6e81e881dd7e2484e08d8b164da5507f62d06ceecf84bf2e21d3",
+                "sha256:2196c06484da2b3fded1ab6dbe182bdabeb09f6318b7fdc412609ee2b564c49a",
+                "sha256:2e9aec8627a1a6823fc62fb96480abe3eb10168fd0d859ee3d3b395105ae19a7",
+                "sha256:3283b574b01e8dbc982080d8287c968489d25329a463b29a90d4157de4f2baaf",
+                "sha256:3c52eb595db83e189419bf337b59154bdcca642ee4b2a09e5d7797e41ace783f",
+                "sha256:4b466a23009ff5cdd7076eb56aca537c745ca491293cc38e72bf1e0e00de5b91",
+                "sha256:517a681919bf880ce1dac7e5bc0c3af1e58ba118fd774da2ffcd93c5f96eaece",
+                "sha256:5f8bbaf4013b9a50e8100333cc4e3fa2f81214033e05ac5aa44fa24a98670a29",
+                "sha256:6257bb45ad78abacda13f15bde5886efd6bf549dd71085e64b8dcf9919c38b60",
+                "sha256:67195274fd27780f15c4c372f4ba9a5c02dad6d50647b917b6a92bf00b3d301a",
+                "sha256:6cafde02f6699ce4ff643417d1a9223716ec25e228ddc3b436fe7e2d25a1f305",
+                "sha256:73ef93e5e1d3c8e83f1ff2e7fdd026d9e063c7e089394869a6e2985696693766",
+                "sha256:7845b31959468bc5b78d7b95ec52fe5be32b55d0d09983a877cca6aedc51068f",
+                "sha256:7847ca62e581e6088d9000f3c497267868ca2fa89432714e21a4fb33a04d52e8",
+                "sha256:7e1d5290044f620f80cf1c969c542a5468f3656de47b41aa78100c5baa2b8276",
+                "sha256:7ee829b86ce984261d99ff2fd6e88f2230068d96c2a582f29583ed602ef3fc2c",
"sha256:83fcff3c7df7adff880622a98022626f4f6dbce6639a88a15a3ce0f96466cb60", + "sha256:939328fd539b8d0edf244327398a667b6b140afd3bf7e347cf9813c736211896", + "sha256:95c70da2cd3b6ddf3b9645ecaa8d98f3d80c606624b6d245558d202cd23ea3be", + "sha256:963671eda0b6ba6926d8fc759e3e10335e1dc1b71ff2a43ed2efd6996634dafb", + "sha256:970b1bdc6243ef663ba5c7e36ac9ab1f2bfecb8ad297c9824b542d41a750b298", + "sha256:9863b9420d99dfa9c064042304868e8ba08e89081428a1c471858aa2af6f57c4", + "sha256:ad428e92ab68798d9326bb3e5515bc927444a3d71a93b4a2ca02a8a5d795c572", + "sha256:b48d3d634bca23b172f47f2335c617d3fcb4b3ba18481c96b7943a4c634f5c8d", + "sha256:b9cd67fb763248cbe38f0593cd8611bfe4b8ad82acb3bdf2b0898c23415a1f82", + "sha256:d111a21bbbfd85c17248130deac02bbd9b5e20b303338e0dbe0faa78330e37e0", + "sha256:e1aa5c2410769ca28aa9a7841b80d9d9a1c5f223928ca8bec7e7c9a34d26b1d4", + "sha256:e692dec4a40bfb40ca530e07805b1208c1de071a18d26af4a2a0d79015b352ca", + "sha256:e7c9900b43ac14110efa977be3da28931ffc74c27e96ee89fbcaaf0b0fe338e1", + "sha256:eec39224b2b2e861259d6f3c8b6290d4e0fbdce147adb797484a42278a1a486f", + "sha256:f0b7628fb8efe60fe66fd4adadd7ad2304014770cdc1f4934db41fe46cc8825f", + "sha256:f50e1764ce9353be67267e7fd0da08349397c7db17a562ad036aa7c8f4adfdb6", + "sha256:fab81a92f42d6d525dd47ced310b0c3e10c416bbfae5d59523e63ea22f82b31e" + ], + "index": "pypi", + "version": "==1.10.9" + }, + "pygments": { + "hashes": [ + "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c", + "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1" + ], + "markers": "python_version >= '3.7'", + "version": "==2.15.1" + }, + "rich": { + "hashes": [ + "sha256:8f87bc7ee54675732fa66a05ebfe489e27264caeeff3728c945d25971b6485ec", + "sha256:d653d6bccede5844304c605d5aac802c7cf9621efd700b46c7ec2b51ea914898" + ], + "index": "pypi", + "version": "==13.4.2" + }, + "typing-extensions": { + "hashes": [ + "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26", + "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5" + ], + "markers": "python_version >= '3.7'", + "version": "==4.6.3" + } + }, + "develop": {} +} diff --git a/README.md b/README.md index 811b5a0..f85fb93 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ -# defaultpy +# ledgerai -Default Python project. +Script to transform CSV data into [ledger](https://ledger-cli.org/) accounting +files. + +# Usage Run `pipenv install -dev` to install all packages. @@ -8,3 +11,20 @@ Run `pipenv shell` to get venv shell. Run `pipenv install ` to install a package. +# Architecture + +The script takes a directory in which it recursively searches for CSV and LDG +files. From these files, it generates a single ledger accounting file that +includes all transactions. + +For now, ledger files are simply appended to the output file without +modifications. + +However, the transaction for the CSV files are extended with their *account2* +information, i.e, the category of the transaction. Optionally, these +transactions can also get a more meaningful description and tags. + +The mapping information are stored in a file `mappings.json`. It maps a unique +identifier for each transaction (based on filename, line number) to the +respective *account2*, and (optinally) *tags* or *description. 
+
diff --git a/main.py b/main.py
deleted file mode 100644
index fd55dc1..0000000
--- a/main.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from src import hello
-
-
-if __name__ == "__main__":
-    hello()
-
diff --git a/src/models.py b/src/models.py
new file mode 100644
index 0000000..8f837b1
--- /dev/null
+++ b/src/models.py
@@ -0,0 +1,60 @@
+from pydantic import BaseModel, Extra
+from typing import List
+from pathlib import Path
+from typing import List
+
+
+class CsvConfig(BaseModel):
+    """
+    Class to define how to parse a certain CSV file. We use the
+    file_match_regex attribute to decide whether to apply a config to a file.
+    If multiple configs match a single file, we abort with an error.
+    """
+    class Config:
+        extra = Extra.forbid
+
+    account1: str
+    file_match_regex: str
+    fields: List[str]
+    input_date_format: str = "%m/%d/%Y"
+    output_date_format: str = "%Y/%m/%d"
+    skip: int = 1
+    delimiter: str = ","
+    quotechar: str = "\""
+    currency: str = "$"
+
+
+class Config(BaseModel):
+    """
+    Basic class for the configuration of this script.
+    - input_directory: we search for ldg and csv files recursively here
+    - output_file: single ledger file into which all transactions from the
+      input files are written
+    - mappings_file: JSON file that maps transaction rows to their account2
+    - csv_configs: configuration for the different input files
+    """
+    class Config:
+        extra = Extra.forbid
+
+    input_directory: Path
+    mappings_file: Path
+    output_file: Path = Path("output.ldg")
+    csv_configs: List[CsvConfig]
+
+
+class Transaction(BaseModel):
+    """
+    Class for a ledger transaction that is rendered into the ldg file.
+    """
+    class Config:
+        extra = Extra.forbid
+
+    currency: str
+    debit: str
+    credit: str
+    date: str
+    account1: str
+    account2: str
+    description: str
+    csv_file: str
+    row: str
diff --git a/src/process.py b/src/process.py
new file mode 100644
index 0000000..1377f9c
--- /dev/null
+++ b/src/process.py
@@ -0,0 +1,102 @@
+import csv
+import logging
+import re
+import sys
+import datetime
+import src.utils
+import src.write
+from src.models import Config, CsvConfig, Transaction
+from typing import List, Dict
+
+
+def process_ldg_files(config: Config):
+    for ldg_file in src.utils.get_ldg_files(config.input_directory):
+        with open(ldg_file, 'r') as f_in:
+            with open(config.output_file, 'a') as f_out:
+                f_out.write(f_in.read())
+
+
+def get_csv_config(csv_file: str, csv_configs: List[CsvConfig]) -> CsvConfig:
+    cs = [c for c in csv_configs
+          if re.match(c.file_match_regex, csv_file)]
+    if not cs:
+        logging.critical(f"No CSV config for {csv_file}.")
+        sys.exit(1)
+    elif len(cs) > 1:
+        logging.critical(f"Multiple CSV configs for {csv_file}.")
+        sys.exit(1)
+    return cs[0]
+
+
+def get_transactions(csv_file: str, config: CsvConfig) -> List[Transaction]:
+    def date_to_date(date: str) -> str:
+        d = datetime.datetime.strptime(date, config.input_date_format)
+        return d.strftime(config.output_date_format)
+
+    def flip_sign(amount: str) -> str:
+        return amount[1:] if amount.startswith("-") else "-" + amount
+
+    def row_to_transaction(row, fields):
+        """ The user can configure the mapping of CSV fields to the three
+        required fields date, amount, and description via the CsvConfig.
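+
+        For example (hypothetical config), fields=["date", "", "amount",
+        "description"] assigns CSV column 0 to date, skips column 1, and
+        assigns columns 2 and 3 to amount and description.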
""" + t = {field: row[index] for index, field in fields} + amount = t['amount'] + return Transaction( + currency=config.currency, + debit=flip_sign(amount), + credit=amount, + date=date_to_date(t['date']), + account1=config.account1, + account2="account2", + description=t['description'], + csv_file=csv_file, + row=csv_file + ", " + ", ".join(row)) + + fields = [(i, f) for i, f in enumerate(config.fields) if f] + with open(csv_file, 'r') as f: + reader = csv.reader(f, delimiter=config.delimiter, + quotechar=config.quotechar) + for _ in range(config.skip): + next(reader) + transactions = [row_to_transaction(row, fields) + for row in reader if row] + return transactions + + +def find_duplicates(transactions: List[Transaction]): + rows = set() + for t in transactions: + row = t.row + if row in rows: + logging.critical(f"'{row}' is duplicated.") + logging.critical("Exit because of duplicated transactions.") + sys.exit(1) + else: + rows.add(row) + + +def apply_mappings(transactions: List[Transaction], mappings: Dict[str, str]): + unused_mappings = set(mappings.keys()) + for t in transactions: + if t.row in mappings: + t.account2 = mappings[t.row] + unused_mappings.discard(t.row) + else: + logging.warning(f"No mapping for '{t}'.") + for row in unused_mappings: + logging.warning(f"Unused mapping '{row}' -> {mappings[row]}.") + + +def process_csv_files(config: Config): + csv_files = src.utils.get_csv_files(config.input_directory) + transactions = [] + for csv_file in csv_files: + csv_file = str(csv_file) + csv_config = get_csv_config(csv_file, config.csv_configs) + transactions += get_transactions(csv_file, csv_config) + find_duplicates(transactions) + mappings = src.utils.read_mappings(config.mappings_file) + apply_mappings(transactions, mappings) + src.utils.write_mappings(transactions, config.mappings_file) + src.write.render_to_file(transactions, config.output_file) + diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..171294c --- /dev/null +++ b/src/utils.py @@ -0,0 +1,72 @@ +import logging +import os +import sys +import logging +import json +from pathlib import Path +from typing import List, Dict +from src.models import Config, Transaction +from pydantic import ValidationError + + +def get_files(directory: Path, ending="") -> List[Path]: + """ Gets files from directory recursively in lexigraphic order. 
""" + return [Path(os.path.join(subdir, f)) + for subdir, _, files in os.walk(directory) + for f in files + if f.endswith(ending)] + + +def get_csv_files(directory: Path) -> List[Path]: + return get_files(directory, ".csv") + + +def get_ldg_files(directory: Path) -> List[Path]: + return get_files(directory, ".ldg") + + +def load_config() -> Config: + try: + config_file = Path(sys.argv[1]) + except IndexError: + logging.critical("Provide configuration file as first argument.") + sys.exit(1) + + try: + with open(config_file, 'r') as f: + config = Config(**json.load(f)) + except ValidationError as e: + logging.critical(f"Could not validate {config_file}.") + logging.info(e) + sys.exit(1) + except FileNotFoundError: + logging.critical(f"Could not find {config_file}.") + sys.exit(1) + return config + + +def write_mappings(transactions: List[Transaction], mappings_file: Path): + mappings = {} + for t in transactions: + try: + mappings[t.account2.strip()].append(t.row) + except KeyError: + mappings[t.account2.strip()] = [t.row] + + with open(mappings_file, "w") as f: + json.dump({k: sorted(v) for k, v in sorted(mappings.items())}, f, indent=4) + + +def read_mappings(mappings_file: Path) -> Dict[str, str]: + with open(mappings_file, 'r') as f: + account2_to_rows = json.load(f) + return {row: category + for category, rows in account2_to_rows.items() + for row in rows} + + +def remove_if_exists(output_file: Path): + try: + os.remove(output_file) + except OSError: + pass diff --git a/src/write.py b/src/write.py new file mode 100644 index 0000000..046cb9c --- /dev/null +++ b/src/write.py @@ -0,0 +1,17 @@ +from pathlib import Path +from typing import List +from src.models import Transaction + + +LEDGER_TRANSACTION_TEMPLATE = """ +{t.date} {t.description} ; {t.row} + {t.account2} {t.currency} {t.debit} + {t.account1} {t.currency} {t.credit} +""" + + +def render_to_file(transactions: List[Transaction], ledger_file: Path): + content = "".join([LEDGER_TRANSACTION_TEMPLATE.format(t=t) + for t in transactions]) + with open(ledger_file, 'a') as f: + f.write(content) diff --git a/toldg.py b/toldg.py new file mode 100644 index 0000000..d13f484 --- /dev/null +++ b/toldg.py @@ -0,0 +1,42 @@ +import os.path +import csv +import logging +import src.utils +import src.process +from src.models import Transaction +from rich.logging import RichHandler +from typing import List + + +def write_mappings(unmatched_transactions: List[Transaction], mappings_directory: str): + """ Write mappings for unmatched expenses for update by the user. """ + if not unmatched_transactions: + return + fn = os.path.join(mappings_directory, "unmatched.csv") + with open(fn, 'a') as f: + writer = csv.writer(f) + for t in unmatched_transactions: + e = ["expenses", t.description, + f"credit={t.credit};date={t.date}"] + writer.writerow(e) + + +def init_logging(): + logging.basicConfig( + level=logging.INFO, + format="%(message)s", + datefmt="[%X]", + handlers=[RichHandler()], + ) + + +def main(): + init_logging() + config = src.utils.load_config() + src.utils.remove_if_exists(config.output_file) + src.process.process_ldg_files(config) + src.process.process_csv_files(config) + + +if __name__ == "__main__": + main()