Add scripts and update readme.
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,3 +1,5 @@
|
||||
# Ignore sensitive data
|
||||
gather.json
|
||||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
||||
27
README.md
27
README.md
@@ -1,3 +1,28 @@
|
||||
# ledgerpy
|
||||
|
||||
Scripts to transform different input formats (CSV and OFX) into ledger accounting files. Includes mapping language to update transaction details automatically.
|
||||
Scripts to transform different input formats (CSV and OFX) into ledger
|
||||
accounting files. Includes mapping language to update transaction details
|
||||
automatically.
|
||||
|
||||
There are other [scripts](https://github.com/ledger/ledger/wiki/CSV-Import) that
|
||||
attempt to handle the same use-cases. I have tried a couple of them, as well as
|
||||
hledger's integrated CSV import, and ran into issues or didn't like the
|
||||
usability. That's why I wrote my own scripts for my workflow. Probably not too
|
||||
useful for anybody else, but I included an example workspace to showcase how I
|
||||
use the scripts.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- jinja2
|
||||
- ofxtools
|
||||
- python3.8 or higher
|
||||
|
||||
## Todo
|
||||
|
||||
- [ ] Write this readme
|
||||
- [ ] Create setup.py file
|
||||
- [ ] Use OFX parser from ofxtools instead of parsing the XML
|
||||
- [ ] Autoappend latest OFX data to CSV file
|
||||
- [ ] Include example workspace with mock data to demo my workflow
|
||||
|
||||
|
||||
|
||||
94
getofx.py
Normal file
94
getofx.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3

import csv
import datetime
import json
import logging
import sys
import xml.etree.ElementTree as ET
# Fixed: namedtuple lives in collections, not functools — the old
# "from functools import namedtuple" raised ImportError at startup.
from collections import namedtuple

import ofxtools
from ofxtools import OFXClient
from ofxtools.Client import StmtRq, CcStmtEndRq, CcStmtRq
|
||||
|
||||
|
||||
def get_transactions(data):
    """Parse an OFX response (as XML text) into a list of Transaction tuples.

    Each STMTTRN element yields one Transaction. NAME is prepended to and
    MEMO appended to the description; DTPOSTED is reformatted to MM/DD/YYYY.
    Fields the OFX data does not carry are filled with "-".
    """
    Transaction = namedtuple("Transaction",
                             ["details", "date", "description",
                              "amount", "type", "balance", "slip"])
    transactions = []
    for stmt in ET.fromstring(data).iter("STMTTRN"):
        desc, posted, amt = "", "", ""
        for elem in stmt:
            tag, text = elem.tag, elem.text
            if tag == "TRNAMT":
                amt = text
            elif tag == "DTPOSTED":
                # Only the leading YYYYMMDD part matters; drop the time.
                parsed = datetime.datetime.strptime(text[:8], "%Y%m%d")
                posted = parsed.strftime("%m/%d/%Y")
            elif tag == "NAME":
                desc = (text + " " + desc) if desc else text
            elif tag == "MEMO":
                desc = (desc + " " + text) if desc else text
        transactions.append(
            Transaction("-", posted, desc, amt, "-", "-", "-"))
    return transactions
|
||||
|
||||
|
||||
def process_account(client, secret, year, name, accttype, acctid, csv_file):
    """Download one calendar year of statements for a single account and
    write the transactions to csv_file.

    name is unused here but kept in the signature because callers splat the
    whole account config dict (which includes "name") into this function.
    """
    dtstart = datetime.datetime(int(year), 1, 1, tzinfo=ofxtools.utils.UTC)
    dtend = datetime.datetime(int(year), 12, 31, tzinfo=ofxtools.utils.UTC)

    # Bank accounts need a bank statement request; anything else is treated
    # as a credit-card account.
    if accttype.upper() in ("CHECKING", "SAVINGS"):
        rq = StmtRq(acctid=acctid, accttype=accttype.upper(),
                    dtstart=dtstart, dtend=dtend)
    else:
        rq = CcStmtRq(acctid=acctid, dtstart=dtstart, dtend=dtend)

    response = client.request_statements(secret, rq)
    data = response.read().decode()
    transactions = get_transactions(data)

    # newline="" is required for csv.writer file objects; without it the
    # csv module emits blank lines between rows on Windows.
    with open(csv_file, "w", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["details", "date", "description",
                             "amount", "type", "balance", "slip"])
        csv_writer.writerows(transactions)
|
||||
|
||||
|
||||
def get_client(url, userid, org, fid, clientuid, bankid, version, **kwargs):
    """Build an OFXClient from the "client" section of the config.

    Extra keyword arguments are accepted (and ignored) so the whole config
    mapping can be splatted in without filtering its keys.
    """
    client = OFXClient(
        url,
        userid=userid,
        org=org,
        fid=fid,
        clientuid=clientuid,
        bankid=bankid,
        version=version,
        prettyprint=True,
    )
    return client
|
||||
|
||||
|
||||
def main(config):
    """Fetch and export transactions for every account in the config."""
    client = get_client(**config["client"])
    year, secret = config["year"], config["secret"]
    for account in config["accounts"]:
        name = account["name"]
        logging.info(f"Processing {name}.")
        process_account(client, secret, year, **account)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Config file may be passed as the first CLI argument; fall back to the
    # default gather.json next to the script otherwise.
    config_file = sys.argv[1] if len(sys.argv) > 1 else "gather.json"
    with open(config_file, 'r') as f:
        main(json.load(f))
|
||||
|
||||
316
toldg.py
Normal file
316
toldg.py
Normal file
@@ -0,0 +1,316 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import sys
|
||||
import csv
|
||||
import os.path
|
||||
import time
|
||||
import re
|
||||
import datetime
|
||||
import logging
|
||||
import jinja2
|
||||
import shutil
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Tuple
|
||||
|
||||
|
||||
@dataclass
class Config:
    """
    Basic class for the configuration of this script.
    - input_directory: we search for ldg and csv files recursively here
    - output_directory: for all input files we do name.replace(input_directory,
      output_directory)
    - mappings_directory: directory of CSV mapping files
    - csv_configs: configuration for the different input files
    """
    input_directory: str
    output_directory: str
    mappings_directory: str
    # Loaded from JSON as a list of dicts; main() replaces each dict with a
    # CsvConfig instance in place before any file is processed.
    csv_configs: List
|
||||
|
||||
|
||||
@dataclass
class CsvConfig:
    """
    Class to define how to parse a certain CSV file. We use the
    file_match_regex attribute to decide whether to apply a config for a file.
    If multiple configs match a single file we raise an exception.
    """
    account1: str                         # ledger account the CSV file posts against
    file_match_regex: str                 # tested with re.match against the file path
    fields: List[str]                     # per-column names; empty string = ignore column
    input_date_format: str = "%m/%d/%Y"   # strptime format of dates in the CSV
    output_date_format: str = "%Y/%m/%d"  # strftime format written to the ldg file
    skip: int = 1                         # number of header rows to skip
    delimiter: str = ","
    quotechar: str = "\""
    currency: str = "$"                   # currency symbol used in rendered postings
|
||||
|
||||
|
||||
@dataclass
class CsvMapping:
    """
    Class that defines the account2 attribute for a CSV transaction.

    Attributes:
        mapping_file: path of the mapping CSV file this rule was read from
        account2: ledger account assigned when the rule matches
        description_pattern: string or regex to match the description
        specifiers: additional conditions in the form
            transaction_attribute=value;another_attribute=value2
            parsed into (attribute, value) pairs
    """
    mapping_file: str
    account2: str
    description_pattern: str
    # default_factory=list replaces the needless `lambda: []` wrapper.
    specifiers: List[Tuple[str, str]] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class LdgTransaction:
    """
    Class for ledger transaction to render into ldg file.
    """
    currency: str     # currency symbol, e.g. "$"
    debit: str        # amount posted to account2 (sign-flipped CSV amount)
    credit: str       # amount posted to account1 (raw CSV amount)
    date: str         # already converted to the configured output format
    account1: str     # source account (CsvConfig.account1), space-padded
    account2: str     # destination account resolved via mappings, space-padded
    description: str
    csv_file: str     # CSV file the row came from
    row: str          # original CSV row joined with ", " (kept as ldg comment)
|
||||
|
||||
|
||||
LEDGER_TRANSACTION_TEMPLATE = """
|
||||
{{t.date}} {{t.description}} ; {{t.row}}
|
||||
{{t.account2}} {{t.currency}} {{t.debit}}
|
||||
{{t.account1}} {{t.currency}} {{t.credit}}
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def get_files(input_directory):
    """Return every file path under input_directory, recursively, sorted
    lexicographically."""
    found = []
    for subdir, _dirs, filenames in os.walk(input_directory):
        found.extend(os.path.join(subdir, name) for name in filenames)
    found.sort()
    return found
|
||||
|
||||
|
||||
def get_mappings(mappings_directory: str) -> List[CsvMapping]:
    """Load every mapping rule from all CSV files under mappings_directory.

    A mapping row is ``account2,description_pattern[,specifiers]``.  A
    pattern wrapped in slashes (``/.../``) is compiled as a case-insensitive
    regex; blank rows and rows whose first cell starts with ``#`` are
    skipped.
    """

    def split_specifiers(text):
        # "attr1=v1;attr2=v2" -> [("attr1", "v1"), ("attr2", "v2")]
        pairs = []
        for chunk in text.split(';'):
            attr, value = chunk.split("=")
            pairs.append((attr, value))
        return pairs

    def load_file(path):
        def build(row):
            pat = row[1]
            if pat.startswith("/") and pat.endswith("/"):
                row[1] = re.compile(pat[1:-1], re.IGNORECASE)
            if len(row) == 3 and row[2]:
                row[2] = split_specifiers(row[2])
            return CsvMapping(path, *row)

        with open(path, 'r') as f:
            rows = csv.reader(f, delimiter=',', quotechar='"')
            # skip empty lines and comment rows
            return [build(row) for row in rows
                    if row and not row[0].startswith("#")]

    mappings = []
    for mapping_file in get_files(mappings_directory):
        mappings.extend(load_file(mapping_file))
    return mappings
|
||||
|
||||
|
||||
def get_transactions(csv_file, config: CsvConfig, mappings: List[CsvMapping]):
    """Parse csv_file according to config and return the pair
    (transactions, unmatched_expenses).

    transactions is a list of LdgTransaction built from the CSV rows;
    unmatched_expenses is a list of ready-made mapping-file lines (strings)
    for descriptions no mapping matched, so the caller can append them to
    the mappings directory for later curation.
    """

    def date_to_date(date):
        # Convert from the CSV's input date format to the ldg output format.
        d = datetime.datetime.strptime(date, config.input_date_format)
        return d.strftime(config.output_date_format)

    def flip_sign(amount):
        # String-level negation: "-12.34" -> "12.34", "12.34" -> "-12.34".
        if amount.startswith("-"):
            return amount[1:]
        return "-" + amount

    def make_equal_len(str_1, str_2):
        # Pad the shorter string with spaces so amounts line up in the ldg.
        max_len = max(len(str_1), len(str_2))
        str_1 += " " * (max_len - len(str_1))
        str_2 += " " * (max_len - len(str_2))
        return (str_1, str_2)

    def get_account2(transaction):
        # Pick the account2 of the first mapping whose description pattern
        # (exact string, or regex match) and specifiers all match.
        t = transaction
        matching_mappings = []
        for mapping in mappings:
            pattern = mapping.description_pattern
            if type(pattern) is str and pattern == transaction.description:
                pass
            elif type(pattern) is re.Pattern and pattern.match(t.description):
                pass
            else:
                continue

            if all(getattr(t, attr) == value
                   for attr, value in mapping.specifiers):
                matching_mappings.append(mapping)

        if not matching_mappings:
            logging.info(f"No match for {transaction}.")
            # Emit a template mapping line so the user can classify it later.
            e = f"expenses,{t.description},credit={t.credit};date={t.date}\n"
            unmatched_expenses.append(e)
            return "expenses"
        elif len(matching_mappings) == 1:
            return matching_mappings[0].account2
        else:
            logging.info(
                f"\nMultiple matches for {transaction}. Picking first.")
            for m in matching_mappings:
                logging.info(f"  {m}")
            return matching_mappings[0].account2

    def row_to_transaction(row):
        values = {name: row[index] for index, name in fields}
        amount = values['amount']
        t = LdgTransaction(config.currency, flip_sign(amount), amount,
                           date_to_date(values['date']), config.account1,
                           "", values['description'], csv_file, ", ".join(row))
        t.account1, t.account2 = make_equal_len(t.account1, get_account2(t))
        return t

    # (index, name) pairs for the CSV columns we use; empty names in
    # config.fields mean "ignore this column". (Renamed from `field`, which
    # shadowed the dataclasses.field import.)
    fields = [(index, name)
              for index, name in enumerate(config.fields) if name]
    unmatched_expenses = []
    with open(csv_file, 'r') as f:
        reader = csv.reader(f, delimiter=config.delimiter,
                            quotechar=config.quotechar)
        # Skip header rows. Using next(reader, None) tolerates files shorter
        # than config.skip; the old side-effect list comprehension crashed
        # with StopIteration on such files.
        for _ in range(config.skip):
            next(reader, None)
        transactions = [row_to_transaction(row) for row in reader if row]
    return transactions, unmatched_expenses
|
||||
|
||||
|
||||
def render_to_file(transactions, csv_file, ledger_file, template_file=""):
    """Render transactions into ledger_file, writing only when the content
    actually changes (so mtimes stay stable for unchanged files).

    template_file: optional path to a Jinja2 template; when empty the
    built-in LEDGER_TRANSACTION_TEMPLATE is used.
    """
    if template_file:
        dirname = os.path.dirname(template_file)
        template_file = os.path.basename(template_file)
        template_loader = jinja2.FileSystemLoader(searchpath=dirname)
        template_env = jinja2.Environment(loader=template_loader)
        template = template_env.get_template(template_file)
    else:
        template_env = jinja2.Environment(loader=jinja2.BaseLoader)
        template = template_env.from_string(LEDGER_TRANSACTION_TEMPLATE)

    # Render into a spooled (in-memory) virtual file. We could build a plain
    # string, but this works more nicely with the Jinja API. The with-block
    # closes the temporary file — the old code leaked it.
    with tempfile.SpooledTemporaryFile(mode='w+') as tf:
        for t in transactions:
            tf.write(template.render(t=t))
        tf.seek(0)
        new_ledger_content = tf.read()

    status = "no change"
    if not os.path.isfile(ledger_file):
        with open(ledger_file, 'w') as f:
            f.write(new_ledger_content)
        status = "new"
    else:
        # (Removed a redundant f.close() that sat inside the with-block.)
        with open(ledger_file, 'r') as f:
            old_ledger_content = f.read()
        if new_ledger_content != old_ledger_content:
            with open(ledger_file, 'w') as f:
                f.write(new_ledger_content)
            status = "update"
    logging.info(f"{csv_file:30} -> {ledger_file:30} | {status}")
|
||||
|
||||
|
||||
def main(config):
    """Transform every file under config.input_directory into the mirrored
    path under config.output_directory.

    CSV files are converted to .ldg via their matching CsvConfig and the
    mapping rules; .ldg files are copied through when newer than their
    destination copy. Any other file type raises.
    """

    def file_age(file):
        # Seconds since the file was last modified.
        return time.time() - os.path.getmtime(file)

    def get_csv_config(csv_file: str, csv_configs: List[CsvConfig]) -> CsvConfig:
        # Exactly one config must match the file path; zero or several is a
        # configuration error worth failing loudly on.
        cs = [c for c in csv_configs
              if re.match(c.file_match_regex, csv_file)]
        if not cs:
            raise Exception(f"No config for {csv_file=}.")
        elif len(cs) > 1:
            raise Exception(f"More than one config for {csv_file=}.")
        return cs[0]

    def write_unmatched_expenses(unmatched_expenses, mappings_directory):
        # Append template mapping lines for unmatched transactions so the
        # user can curate them into real mapping files later. Note: appends,
        # so repeated runs can accumulate duplicates in unmatched.csv.
        if not unmatched_expenses:
            return
        fn = os.path.join(mappings_directory, "unmatched.csv")
        with open(fn, 'a') as f:
            for e in unmatched_expenses:
                f.write(e)

    def csv_to_ldg_filename(csv_file: str, config: Config):
        # Mirror the input path into the output directory, swapping the
        # .csv suffix for .ldg.
        r = csv_file
        r = r.replace(config.input_directory, config.output_directory)
        r = r.replace(".csv", ".ldg")
        return r

    def process_csv_file(csv_file, mappings: List[CsvMapping], config: Config):
        # Convert one CSV into its ldg counterpart and record any
        # transactions no mapping matched.
        ledger_file = csv_to_ldg_filename(csv_file, config)
        csv_config = get_csv_config(csv_file, config.csv_configs)

        transactions, unmatched = get_transactions(
            csv_file, csv_config, mappings)
        write_unmatched_expenses(unmatched, config.mappings_directory)
        render_to_file(transactions, csv_file, ledger_file)

    def process_ldg_file(ldg_file: str, config: Config):
        # Hand-written ldg files are copied verbatim — when missing at the
        # destination, or when the source is newer than the copy.
        dest_file = ldg_file.replace(
            config.input_directory, config.output_directory)
        status = "no change"
        if not os.path.isfile(dest_file):
            status = "new"
            shutil.copy(ldg_file, dest_file)
        if file_age(dest_file) > file_age(ldg_file):
            shutil.copy(ldg_file, dest_file)
            status = "update"
        logging.info(f"{ldg_file:30} -> {dest_file:30} | {status}")

    input_files = get_files(config.input_directory)
    # NOTE: mutates config in place — csv_configs dicts become CsvConfig
    # instances from here on.
    config.csv_configs = [CsvConfig(**c) for c in config.csv_configs]
    mappings = get_mappings(config.mappings_directory)
    for f in input_files:
        if f.endswith(".csv"):
            process_csv_file(f, mappings, config)
        elif f.endswith(".ldg"):
            process_ldg_file(f, config)
        else:
            m = f"Unsupported file type for '{f}'."
            raise Exception(m)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Log plain messages to stdout so the per-file status lines read cleanly.
    logging.basicConfig(stream=sys.stdout,
                        level=logging.DEBUG,
                        format='%(message)s')
    # Config file may be passed as the first CLI argument.
    config_file = sys.argv[1] if len(sys.argv) > 1 else "config.json"
    with open(config_file, 'r') as f:
        main(Config(**json.load(f)))
|
||||
Reference in New Issue
Block a user