Refactor ledger processing to an explicit mapping, which will make automated classification easy

felixm 2023-06-24 22:54:31 +02:00
parent b6de0e5514
commit ba0c906e3c
9 changed files with 421 additions and 8 deletions

Pipfile
@@ -4,6 +4,8 @@ verify_ssl = true
name = "pypi"
[packages]
rich = "*"
pydantic = "*"
[dev-packages]

Pipfile.lock generated Normal file
@@ -0,0 +1,104 @@
{
"_meta": {
"hash": {
"sha256": "654c54f63f5623a4ee5945b77e4aed25a286f4264d9ff82eb5196e5f23336dca"
},
"pipfile-spec": 6,
"requires": {
"python_full_version": "3.11.3",
"python_version": "3.11"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"markdown-it-py": {
"hashes": [
"sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1",
"sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"
],
"markers": "python_version >= '3.8'",
"version": "==3.0.0"
},
"mdurl": {
"hashes": [
"sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8",
"sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"
],
"markers": "python_version >= '3.7'",
"version": "==0.1.2"
},
"pydantic": {
"hashes": [
"sha256:07293ab08e7b4d3c9d7de4949a0ea571f11e4557d19ea24dd3ae0c524c0c334d",
"sha256:0a2aabdc73c2a5960e87c3ffebca6ccde88665616d1fd6d3db3178ef427b267a",
"sha256:0da48717dc9495d3a8f215e0d012599db6b8092db02acac5e0d58a65248ec5bc",
"sha256:128d9453d92e6e81e881dd7e2484e08d8b164da5507f62d06ceecf84bf2e21d3",
"sha256:2196c06484da2b3fded1ab6dbe182bdabeb09f6318b7fdc412609ee2b564c49a",
"sha256:2e9aec8627a1a6823fc62fb96480abe3eb10168fd0d859ee3d3b395105ae19a7",
"sha256:3283b574b01e8dbc982080d8287c968489d25329a463b29a90d4157de4f2baaf",
"sha256:3c52eb595db83e189419bf337b59154bdcca642ee4b2a09e5d7797e41ace783f",
"sha256:4b466a23009ff5cdd7076eb56aca537c745ca491293cc38e72bf1e0e00de5b91",
"sha256:517a681919bf880ce1dac7e5bc0c3af1e58ba118fd774da2ffcd93c5f96eaece",
"sha256:5f8bbaf4013b9a50e8100333cc4e3fa2f81214033e05ac5aa44fa24a98670a29",
"sha256:6257bb45ad78abacda13f15bde5886efd6bf549dd71085e64b8dcf9919c38b60",
"sha256:67195274fd27780f15c4c372f4ba9a5c02dad6d50647b917b6a92bf00b3d301a",
"sha256:6cafde02f6699ce4ff643417d1a9223716ec25e228ddc3b436fe7e2d25a1f305",
"sha256:73ef93e5e1d3c8e83f1ff2e7fdd026d9e063c7e089394869a6e2985696693766",
"sha256:7845b31959468bc5b78d7b95ec52fe5be32b55d0d09983a877cca6aedc51068f",
"sha256:7847ca62e581e6088d9000f3c497267868ca2fa89432714e21a4fb33a04d52e8",
"sha256:7e1d5290044f620f80cf1c969c542a5468f3656de47b41aa78100c5baa2b8276",
"sha256:7ee829b86ce984261d99ff2fd6e88f2230068d96c2a582f29583ed602ef3fc2c",
"sha256:83fcff3c7df7adff880622a98022626f4f6dbce6639a88a15a3ce0f96466cb60",
"sha256:939328fd539b8d0edf244327398a667b6b140afd3bf7e347cf9813c736211896",
"sha256:95c70da2cd3b6ddf3b9645ecaa8d98f3d80c606624b6d245558d202cd23ea3be",
"sha256:963671eda0b6ba6926d8fc759e3e10335e1dc1b71ff2a43ed2efd6996634dafb",
"sha256:970b1bdc6243ef663ba5c7e36ac9ab1f2bfecb8ad297c9824b542d41a750b298",
"sha256:9863b9420d99dfa9c064042304868e8ba08e89081428a1c471858aa2af6f57c4",
"sha256:ad428e92ab68798d9326bb3e5515bc927444a3d71a93b4a2ca02a8a5d795c572",
"sha256:b48d3d634bca23b172f47f2335c617d3fcb4b3ba18481c96b7943a4c634f5c8d",
"sha256:b9cd67fb763248cbe38f0593cd8611bfe4b8ad82acb3bdf2b0898c23415a1f82",
"sha256:d111a21bbbfd85c17248130deac02bbd9b5e20b303338e0dbe0faa78330e37e0",
"sha256:e1aa5c2410769ca28aa9a7841b80d9d9a1c5f223928ca8bec7e7c9a34d26b1d4",
"sha256:e692dec4a40bfb40ca530e07805b1208c1de071a18d26af4a2a0d79015b352ca",
"sha256:e7c9900b43ac14110efa977be3da28931ffc74c27e96ee89fbcaaf0b0fe338e1",
"sha256:eec39224b2b2e861259d6f3c8b6290d4e0fbdce147adb797484a42278a1a486f",
"sha256:f0b7628fb8efe60fe66fd4adadd7ad2304014770cdc1f4934db41fe46cc8825f",
"sha256:f50e1764ce9353be67267e7fd0da08349397c7db17a562ad036aa7c8f4adfdb6",
"sha256:fab81a92f42d6d525dd47ced310b0c3e10c416bbfae5d59523e63ea22f82b31e"
],
"index": "pypi",
"version": "==1.10.9"
},
"pygments": {
"hashes": [
"sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c",
"sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"
],
"markers": "python_version >= '3.7'",
"version": "==2.15.1"
},
"rich": {
"hashes": [
"sha256:8f87bc7ee54675732fa66a05ebfe489e27264caeeff3728c945d25971b6485ec",
"sha256:d653d6bccede5844304c605d5aac802c7cf9621efd700b46c7ec2b51ea914898"
],
"index": "pypi",
"version": "==13.4.2"
},
"typing-extensions": {
"hashes": [
"sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26",
"sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"
],
"markers": "python_version >= '3.7'",
"version": "==4.6.3"
}
},
"develop": {}
}

README.md
@@ -1,6 +1,9 @@
# defaultpy
# ledgerai
Default Python project.
Script to transform CSV data into [ledger](https://ledger-cli.org/) accounting
files.
# Usage
Run `pipenv install --dev` to install all packages.
@@ -8,3 +11,20 @@ Run `pipenv shell` to get venv shell.
Run `pipenv install <package>` to install a package.
# Architecture
The script takes a directory in which it recursively searches for CSV and LDG
files. From these files, it generates a single ledger accounting file that
includes all transactions.
For now, ledger files are simply appended to the output file without
modifications.
However, the transactions from the CSV files are extended with their *account2*
information, i.e., the category of the transaction. Optionally, these
transactions can also get a more meaningful description and tags.
The mapping information is stored in a file `mappings.json`. It maps a unique
identifier for each transaction (based on filename and line number) to the
respective *account2* and (optionally) *tags* or *description*.
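For illustration, a hypothetical `mappings.json` could look like this (rows
invented; on disk the file groups row identifiers under their *account2*,
matching what `write_mappings` in `src/utils.py` produces):

    {
        "expenses:groceries": [
            "input/bank.csv, 06/01/2023, -42.17, WHOLE FOODS"
        ],
        "expenses:rent": [
            "input/bank.csv, 06/03/2023, -1200.00, ACME PROPERTY"
        ]
    }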

@@ -1,6 +0,0 @@
from src import hello

if __name__ == "__main__":
    hello()

src/models.py Normal file
@@ -0,0 +1,60 @@
from pathlib import Path
from typing import List

from pydantic import BaseModel, Extra


class CsvConfig(BaseModel):
    """
    Class to define how to parse a certain CSV file. We use the
    file_match_regex attribute to decide whether to apply a config to a file.
    If multiple configs match a single file, we raise an exception.
    """
    class Config:
        extra = Extra.forbid

    account1: str
    file_match_regex: str
    fields: List[str]
    input_date_format: str = "%m/%d/%Y"
    output_date_format: str = "%Y/%m/%d"
    skip: int = 1
    delimiter: str = ","
    quotechar: str = "\""
    currency: str = "$"


class Config(BaseModel):
    """
    Basic class for the configuration of this script.
    - input_directory: we search for ldg and csv files recursively here
    - mappings_file: JSON file that maps transaction rows to account2 values
    - output_file: single ledger file that all transactions are written to
    - csv_configs: configuration for the different input files
    """
    class Config:
        extra = Extra.forbid

    input_directory: Path
    mappings_file: Path
    output_file: Path = Path("output.ldg")
    csv_configs: List[CsvConfig]


class Transaction(BaseModel):
    """
    Class for a ledger transaction to render into an ldg file.
    """
    class Config:
        extra = Extra.forbid

    currency: str
    debit: str
    credit: str
    date: str
    account1: str
    account2: str
    description: str
    csv_file: str
    row: str
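As a minimal sketch (all paths, account names, and the regex invented), a JSON
configuration that these models accept, and that `load_config` in
`src/utils.py` parses, might look like:

    {
        "input_directory": "input",
        "mappings_file": "mappings.json",
        "output_file": "output.ldg",
        "csv_configs": [
            {
                "account1": "assets:checking",
                "file_match_regex": ".*bank.*\\.csv",
                "fields": ["date", "amount", "description"]
            }
        ]
    }

The `fields` list names the role of each CSV column; the processing code
requires at least `date`, `amount`, and `description`.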

src/process.py Normal file
@@ -0,0 +1,102 @@
import csv
import datetime
import logging
import re
import sys
from typing import Dict, List

import src.utils
import src.write
from src.models import Config, CsvConfig, Transaction


def process_ldg_files(config: Config):
    for ldg_file in src.utils.get_ldg_files(config.input_directory):
        with open(ldg_file, 'r') as f_in:
            with open(config.output_file, 'a') as f_out:
                f_out.write(f_in.read())


def get_csv_config(csv_file: str, csv_configs: List[CsvConfig]) -> CsvConfig:
    cs = [c for c in csv_configs
          if re.match(c.file_match_regex, csv_file)]
    if not cs:
        logging.critical(f"No CSV config for {csv_file}.")
        sys.exit(1)
    elif len(cs) > 1:
        logging.critical(f"Multiple CSV configs for {csv_file}.")
        sys.exit(1)
    return cs[0]


def get_transactions(csv_file: str, config: CsvConfig) -> List[Transaction]:
    def date_to_date(date: str) -> str:
        d = datetime.datetime.strptime(date, config.input_date_format)
        return d.strftime(config.output_date_format)

    def flip_sign(amount: str) -> str:
        return amount[1:] if amount.startswith("-") else "-" + amount

    def row_to_transaction(row, fields):
        """ The user can configure the mapping of CSV fields to the three
        required fields date, amount, and description via the CsvConfig. """
        t = {field: row[index] for index, field in fields}
        amount = t['amount']
        return Transaction(
            currency=config.currency,
            debit=flip_sign(amount),
            credit=amount,
            date=date_to_date(t['date']),
            account1=config.account1,
            account2="account2",
            description=t['description'],
            csv_file=csv_file,
            row=csv_file + ", " + ", ".join(row))

    fields = [(i, f) for i, f in enumerate(config.fields) if f]
    with open(csv_file, 'r') as f:
        reader = csv.reader(f, delimiter=config.delimiter,
                            quotechar=config.quotechar)
        for _ in range(config.skip):
            next(reader)
        transactions = [row_to_transaction(row, fields)
                        for row in reader if row]
    return transactions


def find_duplicates(transactions: List[Transaction]):
    rows = set()
    for t in transactions:
        row = t.row
        if row in rows:
            logging.critical(f"'{row}' is duplicated.")
            logging.critical("Exit because of duplicated transactions.")
            sys.exit(1)
        else:
            rows.add(row)


def apply_mappings(transactions: List[Transaction], mappings: Dict[str, str]):
    unused_mappings = set(mappings.keys())
    for t in transactions:
        if t.row in mappings:
            t.account2 = mappings[t.row]
            unused_mappings.discard(t.row)
        else:
            logging.warning(f"No mapping for '{t}'.")
    for row in unused_mappings:
        logging.warning(f"Unused mapping '{row}' -> {mappings[row]}.")


def process_csv_files(config: Config):
    csv_files = src.utils.get_csv_files(config.input_directory)
    transactions = []
    for csv_file in csv_files:
        csv_file = str(csv_file)
        csv_config = get_csv_config(csv_file, config.csv_configs)
        transactions += get_transactions(csv_file, csv_config)
    find_duplicates(transactions)
    mappings = src.utils.read_mappings(config.mappings_file)
    apply_mappings(transactions, mappings)
    src.utils.write_mappings(transactions, config.mappings_file)
    src.write.render_to_file(transactions, config.output_file)
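To make the field mapping concrete, a small sketch (row and field values
invented): empty strings in `config.fields` skip columns, and the surviving
`(index, name)` pairs pull values out of each CSV row.

    # Illustration of the fields mapping used in get_transactions.
    config_fields = ["date", "", "amount", "description"]
    fields = [(i, f) for i, f in enumerate(config_fields) if f]
    # fields == [(0, 'date'), (2, 'amount'), (3, 'description')]

    row = ["06/24/2023", "POS PURCHASE", "-12.50", "COFFEE SHOP"]
    t = {field: row[index] for index, field in fields}
    # t == {'date': '06/24/2023', 'amount': '-12.50',
    #       'description': 'COFFEE SHOP'}
    # flip_sign then turns '-12.50' into the debit '12.50' while the
    # original '-12.50' remains the credit.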

src/utils.py Normal file
@@ -0,0 +1,72 @@
import json
import logging
import os
import sys
from pathlib import Path
from typing import Dict, List

from pydantic import ValidationError

from src.models import Config, Transaction


def get_files(directory: Path, ending="") -> List[Path]:
    """ Gets files from the directory recursively in lexicographic order. """
    # Sort explicitly; os.walk does not guarantee a stable order.
    return sorted(Path(os.path.join(subdir, f))
                  for subdir, _, files in os.walk(directory)
                  for f in files
                  if f.endswith(ending))


def get_csv_files(directory: Path) -> List[Path]:
    return get_files(directory, ".csv")


def get_ldg_files(directory: Path) -> List[Path]:
    return get_files(directory, ".ldg")


def load_config() -> Config:
    try:
        config_file = Path(sys.argv[1])
    except IndexError:
        logging.critical("Provide configuration file as first argument.")
        sys.exit(1)
    try:
        with open(config_file, 'r') as f:
            config = Config(**json.load(f))
    except ValidationError as e:
        logging.critical(f"Could not validate {config_file}.")
        logging.info(e)
        sys.exit(1)
    except FileNotFoundError:
        logging.critical(f"Could not find {config_file}.")
        sys.exit(1)
    return config


def write_mappings(transactions: List[Transaction], mappings_file: Path):
    mappings = {}
    for t in transactions:
        try:
            mappings[t.account2.strip()].append(t.row)
        except KeyError:
            mappings[t.account2.strip()] = [t.row]
    with open(mappings_file, "w") as f:
        json.dump({k: sorted(v) for k, v in sorted(mappings.items())},
                  f, indent=4)


def read_mappings(mappings_file: Path) -> Dict[str, str]:
    with open(mappings_file, 'r') as f:
        account2_to_rows = json.load(f)
    return {row: category
            for category, rows in account2_to_rows.items()
            for row in rows}


def remove_if_exists(output_file: Path):
    try:
        os.remove(output_file)
    except OSError:
        pass
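Note the asymmetry here: `write_mappings` stores `{account2: [rows]}` on disk,
while `read_mappings` inverts that back into a `{row: account2}` lookup. A
minimal round-trip sketch (data invented):

    # write_mappings produces this shape on disk ...
    on_disk = {"expenses:groceries":
               ["bank.csv, 06/01/2023, -42.17, WHOLE FOODS"]}
    # ... and read_mappings flattens it into a per-row lookup.
    lookup = {row: category
              for category, rows in on_disk.items()
              for row in rows}
    assert lookup == {"bank.csv, 06/01/2023, -42.17, WHOLE FOODS":
                      "expenses:groceries"}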

src/write.py Normal file
@@ -0,0 +1,17 @@
from pathlib import Path
from typing import List

from src.models import Transaction

LEDGER_TRANSACTION_TEMPLATE = """
{t.date} {t.description} ; {t.row}
    {t.account2}    {t.currency} {t.debit}
    {t.account1}    {t.currency} {t.credit}
"""


def render_to_file(transactions: List[Transaction], ledger_file: Path):
    content = "".join([LEDGER_TRANSACTION_TEMPLATE.format(t=t)
                       for t in transactions])
    with open(ledger_file, 'a') as f:
        f.write(content)
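Rendered through `LEDGER_TRANSACTION_TEMPLATE`, a single transaction (all
values invented, assuming a mapping to `expenses:coffee` exists) comes out
roughly as:

    2023/06/24 COFFEE SHOP ; input/bank.csv, 06/24/2023, -12.50, COFFEE SHOP
        expenses:coffee    $ 12.50
        assets:checking    $ -12.50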

toldg.py Normal file
@@ -0,0 +1,42 @@
import csv
import logging
import os.path
from typing import List

import src.process
import src.utils
from rich.logging import RichHandler
from src.models import Transaction


def write_mappings(unmatched_transactions: List[Transaction],
                   mappings_directory: str):
    """ Write mappings for unmatched expenses for update by the user. """
    if not unmatched_transactions:
        return
    fn = os.path.join(mappings_directory, "unmatched.csv")
    with open(fn, 'a') as f:
        writer = csv.writer(f)
        for t in unmatched_transactions:
            e = ["expenses", t.description,
                 f"credit={t.credit};date={t.date}"]
            writer.writerow(e)


def init_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
        datefmt="[%X]",
        handlers=[RichHandler()],
    )


def main():
    init_logging()
    config = src.utils.load_config()
    src.utils.remove_if_exists(config.output_file)
    src.process.process_ldg_files(config)
    src.process.process_csv_files(config)


if __name__ == "__main__":
    main()
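A typical invocation (config filename hypothetical), run from the project's
pipenv environment:

    pipenv run python toldg.py config.json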