A Typical Project (Livehacking Screenplay: My Own Bookkeeping Effort)¶
Spaghetti Version¶
Contains functions only because necessary (applied with
DataFrame.apply()
)Can imaging that
make_category()
will become worse
INPUTFILE = 'bank.csv'
OUTPUTFILE = 'bank-cat.csv'
import pandas as pd
data = pd.read_csv(
INPUTFILE,
sep=';', encoding='iso-8859-1',
names=('account', 'info', 'time_booked', 'time_valuta', 'amount', 'unit'))
data['time_booked'] = pd.to_datetime(data['time_booked'], format='%d.%m.%Y')
data['time_valuta'] = pd.to_datetime(data['time_valuta'], format='%d.%m.%Y')
def make_float(s):
whole, decimal = s.rsplit(',', maxsplit=1)
whole = whole.replace(',', '')
whole = whole.replace('.', '')
return float(f'{whole}.{decimal}')
data['amount'] = data['amount'].apply(make_float)
def make_category(info):
if info.startswith('Bezahlung Karte'):
which, terminal, organization = info.split('|')
if organization.startswith('ORPHEUM BAR'):
return 'goingout'
if organization.startswith('BILLA DANKT'):
return 'living'
if organization.startswith('HERVIS'):
return 'sport'
if organization.startswith('SHELL'):
return 'car'
return 'card-unknown'
else:
return 'unknown'
data['category'] = data['info'].apply(make_category)
data.to_csv(
OUTPUTFILE,
sep=';', encoding='iso-8859-1',
index=False,
)
A Little Better: Functions (And Commandline Parameters)¶
Dedicated Functions (not only for the purpose of
DataFrame.apply()
)Gives Structure
⟶ Readability
⟶ Steps are obvious
Step 1: reading in ⟶
DataFrame
Step 2: data cleaning
Step 3: categorization
Step 4: writing out
import sys
import pandas as pd
def read_from_csv(csvname):
return pd.read_csv(
csvname,
sep=';', encoding='iso-8859-1',
names=('account', 'info', 'time_booked', 'time_valuta', 'amount', 'unit'))
def write_to_csv(df, csvname):
df.to_csv(
csvname,
sep=';', encoding='iso-8859-1',
index=False,
)
def clean_data(df):
df['time_booked'] = pd.to_datetime(df['time_booked'], format='%d.%m.%Y')
df['time_valuta'] = pd.to_datetime(df['time_valuta'], format='%d.%m.%Y')
def make_float(s):
whole, decimal = s.rsplit(',', maxsplit=1)
whole = whole.replace(',', '')
whole = whole.replace('.', '')
return float(f'{whole}.{decimal}')
df['amount'] = df['amount'].apply(make_float)
return df
def categorize(df):
def make_category(info):
if info.startswith('Bezahlung Karte'):
which, terminal, organization = info.split('|')
if organization.startswith('ORPHEUM BAR'):
return 'goingout'
if organization.startswith('BILLA DANKT'):
return 'living'
if organization.startswith('HERVIS'):
return 'sport'
if organization.startswith('SHELL'):
return 'car'
return 'card-unknown'
else:
return 'unknown'
df['category'] = df['info'].apply(make_category)
return df
INPUTFILE = sys.argv[1]
OUTPUTFILE = sys.argv[2]
data = read_from_csv(INPUTFILE)
data = clean_data(data)
data = categorize(data)
write_to_csv(data, OUTPUTFILE)
Feature Request: Concatenate Input Data From Multiple CSV Files¶
One directory, containing possibly many alphabetic-as-date (e.g.
2022-12.csv
) named filesCommandline arguments: input
[1:-1]
, and output[-1]
“Polymorphic” parameter to
read_from_csv()
:str
or iterableCool 💪: works with both single filename and list of filenames
⟶ works magically
⟶ problem solved
import sys
import pandas as pd
def read_from_csv(csvname_or_list_thereof):
if type(csvname_or_list_thereof) is str:
files = [csvname_or_list_thereof]
else: # must be iterable
files = csvname_or_list_thereof
ret = pd.DataFrame()
for csvname in files:
df = pd.read_csv(
csvname,
sep=';', encoding='iso-8859-1',
names=('account', 'info', 'time_booked', 'time_valuta', 'amount', 'unit'))
ret = pd.concat([ret, df])
return ret
def write_to_csv(df, csvname):
df.to_csv(
csvname,
sep=';', encoding='iso-8859-1',
index=False,
)
def clean_data(df):
df['time_booked'] = pd.to_datetime(df['time_booked'], format='%d.%m.%Y')
df['time_valuta'] = pd.to_datetime(df['time_valuta'], format='%d.%m.%Y')
def make_float(s):
whole, decimal = s.rsplit(',', maxsplit=1)
whole = whole.replace(',', '')
whole = whole.replace('.', '')
return float(f'{whole}.{decimal}')
df['amount'] = df['amount'].apply(make_float)
return df
def categorize(df):
def make_category(info):
if info.startswith('Bezahlung Karte'):
which, terminal, organization = info.split('|')
if organization.startswith('ORPHEUM BAR'):
return 'goingout'
if organization.startswith('BILLA DANKT'):
return 'living'
if organization.startswith('HERVIS'):
return 'sport'
if organization.startswith('SHELL'):
return 'car'
return 'card-unknown'
else:
return 'unknown'
df['category'] = df['info'].apply(make_category)
return df
INPUTFILES = sys.argv[1:-1]
OUTPUTFILE = sys.argv[-1]
data = read_from_csv(INPUTFILES)
data = clean_data(data)
data = categorize(data)
write_to_csv(data, OUTPUTFILE)
Problem created
What if there are files inside the directory that don’t obey the
YYYY-MM.csv
convention?Workaround: put burden on user (not use
data/*.csv
anymore)
Feature Request: Concatenate Input Data From YYYY-MM.csv
Files¶
YYYY-MM.csv
files in specified directoryUse regular expressions (
import re
)⟶ only two commandline parameters
import sys
import os
import re
import pandas as pd
def read_from_csv(csvname_or_list_thereof):
if type(csvname_or_list_thereof) is str:
files = [csvname_or_list_thereof]
else: # must be iterable
files = csvname_or_list_thereof
ret = pd.DataFrame()
for csvname in files:
df = pd.read_csv(
csvname,
sep=';', encoding='iso-8859-1',
names=('account', 'info', 'time_booked', 'time_valuta', 'amount', 'unit'))
ret = pd.concat([ret, df])
return ret
def read_from_dir(dirname):
re_yyyy_mm = re.compile(r'^\d\d\d\d-\d\d\.csv')
names = []
for name in os.listdir(dirname):
if re_yyyy_mm.search(name) is not None:
names.append(os.path.join(dirname, name))
return read_from_csv(names) # <--- filename-list version
def write_to_csv(df, csvname):
df.to_csv(
csvname,
sep=';', encoding='iso-8859-1',
index=False,
)
def clean_data(df):
df['time_booked'] = pd.to_datetime(df['time_booked'], format='%d.%m.%Y')
df['time_valuta'] = pd.to_datetime(df['time_valuta'], format='%d.%m.%Y')
def make_float(s):
whole, decimal = s.rsplit(',', maxsplit=1)
whole = whole.replace(',', '')
whole = whole.replace('.', '')
return float(f'{whole}.{decimal}')
df['amount'] = df['amount'].apply(make_float)
return df
def categorize(df):
def make_category(info):
if info.startswith('Bezahlung Karte'):
which, terminal, organization = info.split('|')
if organization.startswith('ORPHEUM BAR'):
return 'goingout'
if organization.startswith('BILLA DANKT'):
return 'living'
if organization.startswith('HERVIS'):
return 'sport'
if organization.startswith('SHELL'):
return 'car'
return 'card-unknown'
else:
return 'unknown'
df['category'] = df['info'].apply(make_category)
return df
INPUTDIR = sys.argv[1]
OUTPUTFILE = sys.argv[2]
data = read_from_dir(INPUTDIR)
data = clean_data(data)
data = categorize(data)
write_to_csv(data, OUTPUTFILE)
Problem created
We aggregate possibly very many files automatically
How do I test?
$ wc -l data/202*csv
67 data/2022-12.csv
58 data/2023-01.csv
55 data/2023-02.csv
180 total
$ wc -l bank-cat.csv
181 bank-cat.csv
(Fortunately that’s only the CSV column header that
DataFrame.to_csv()
writes out by default. header=False
fixes
that.)
Where Are We? Questions!¶
A number of different ways - policies - to read input data
Data cleaning
Categorization
(Currently) one way to output data
Questions
I’m annoyed by the cycles. Modify, manual test. Libreoffice in the toolchain or what??!
What do I test?
Input?
Cleaning?
Categories?
All in one? Monolithically?
Testability: Input Cleaning¶
Currency conversion
Does the
make_float()
routine work?⟶ factor out from script
⟶ module; lets name it
stuff
⟶
scripts
⟶ project structure
Date conversion
Next: factor out
clean_data()
Test that
⟶ note that the second test is easier to write than the first
from stuff.conversions import make_float, clean_data
import pandas as pd
import numpy as np
def test_make_float():
crap = '2.000,00'
amount = make_float(crap)
assert 1999.9 < amount < 2000.1
def test_date_conversion():
df = pd.DataFrame({
'account': ['AT666666666666666666'],
'info': ['Abbuchung Onlinebanking BG/000009173|BAWAATWWXXX AT211420020010848041|DI Hansjörg Faschingbauer|Ausgleich Firmenkonto'],
'time_booked': ['02.01.2023'],
'time_valuta': ['02.01.2023'],
'amount': ['-400,00'],
'unit': ['EUR']
})
df = clean_data(df)
assert type(df['time_booked'].iloc[0]) is pd.Timestamp
Testability: Reading Input (One Or Multiple Files, Explicitly)¶
test_input.py
stuff/input.py
, containingread_from_csv_single()
read_from_csv_multiple()
Write tests first (don’t touch
scripts/categorize.py
Deliberately omit
encoding
parameter toopen()
⟶ failureRaw strings!
Only finally factor out
read_from_csv()
fromscripts/categorize.py
from stuff.input import read_from_csv_single, read_from_csv_multiple
import pytest
def test_read_csv_single(tmpdir):
lines = [
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
]
with open(tmpdir / '2023-02.csv', 'w', encoding='iso-8859-1') as csv:
csv.writelines([l+'\n' for l in lines])
df = read_from_csv_single(tmpdir / '2023-02.csv')
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
def test_read_csv_multiple(tmpdir):
lines_2023_02 = [
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
]
lines_2023_01 = [
r'AT666666666666666666;Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010;31.01.2023;30.01.2023;-15,75;EUR',
r'AT666666666666666666;120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse;31.01.2023;31.01.2023;-12,43;EUR',
]
with open(tmpdir / '2023-02.csv', 'w', encoding='iso-8859-1') as csv:
csv.writelines([l+'\n' for l in lines_2023_02])
with open(tmpdir / '2023-01.csv', 'w', encoding='iso-8859-1') as csv:
csv.writelines([l+'\n' for l in lines_2023_01])
df = read_from_csv_multiple([tmpdir / '2023-02.csv', tmpdir / '2023-01.csv'])
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
assert df['info'].iloc[2] == r'Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010'
assert df['info'].iloc[3] == r'120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse'
import pandas
def read_from_csv_single(csvname):
return pandas.read_csv(
csvname,
sep=';', encoding='iso-8859-1',
names=('account', 'info', 'time_booked', 'time_valuta', 'amount', 'unit'))
def read_from_csv_multiple(csvnames):
ret = pandas.DataFrame()
for csvname in csvnames:
ret = pandas.concat([ret, read_from_csv_single(csvname)])
return ret
Test Sanity: Common Code¶
Tests contain repeated code
⟶ Factor out (write simple function to create one CSV file in one go)
from stuff.input import read_from_csv_single, read_from_csv_multiple
import pytest
def _write_csv(csvname, lines):
with open(csvname, 'w', encoding='iso-8859-1') as csv:
csv.writelines([l+'\n' for l in lines])
def test_read_csv_single(tmpdir):
_write_csv(
tmpdir / '2023-02.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
])
df = read_from_csv_single(tmpdir / '2023-02.csv')
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
def test_read_csv_multiple(tmpdir):
_write_csv(
tmpdir / '2023-02.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
])
_write_csv(
tmpdir / '2023-01.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010;31.01.2023;30.01.2023;-15,75;EUR',
r'AT666666666666666666;120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse;31.01.2023;31.01.2023;-12,43;EUR',
])
df = read_from_csv_multiple([tmpdir / '2023-02.csv', tmpdir / '2023-01.csv'])
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
assert df['info'].iloc[2] == r'Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010'
assert df['info'].iloc[3] == r'120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse'
Testability: Reading Input From Directory¶
Finally, factor out
read_from_dir()
Don’t forget to modify
scripts/categorize.py
from stuff.input import read_from_csv_single, read_from_csv_multiple, read_from_csv_dir
import pytest
def _write_csv(csvname, lines):
with open(csvname, 'w', encoding='iso-8859-1') as csv:
csv.writelines([l+'\n' for l in lines])
def test_read_csv_single(tmpdir):
_write_csv(
tmpdir / '2023-02.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
])
df = read_from_csv_single(tmpdir / '2023-02.csv')
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
def test_read_csv_multiple(tmpdir):
_write_csv(
tmpdir / '2023-02.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
])
_write_csv(
tmpdir / '2023-01.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010;31.01.2023;30.01.2023;-15,75;EUR',
r'AT666666666666666666;120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse;31.01.2023;31.01.2023;-12,43;EUR',
])
df = read_from_csv_multiple([tmpdir / '2023-02.csv', tmpdir / '2023-01.csv'])
assert len(df) == 4
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
assert df['info'].iloc[2] == r'Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010'
assert df['info'].iloc[3] == r'120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse'
def test_read_csv_dir(tmpdir):
_write_csv(
tmpdir / '2023-02.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
])
_write_csv(
tmpdir / 'garbage.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010;31.01.2023;30.01.2023;-15,75;EUR',
r'AT666666666666666666;120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse;31.01.2023;31.01.2023;-12,43;EUR',
])
df = read_from_csv_dir(tmpdir)
assert len(df) == 2
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
Tests become larger
Programs become smaller
Testability: Categorization¶
First: all-in-one approach
Write entire CSV file, and see how categorization is
⟶ rip
_write_csv()
out fromtest_input.py
Lets see:
__init__.py
necessary
from stuff.category import categorize
from stuff.input import read_from_csv_single
from stuff.conversions import clean_data
from .testutils import write_csv
def test_category_csv(tmpdir):
write_csv(
tmpdir / 'file.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009168|POS 2800 K002 27.12. 09:33|SPAR DANKT 5362\\GRAZ\8020;28.12.2022;27.12.2022;-8,06;EUR',
r'AT666666666666666666;Bezahlung Karte MC/000009169|POS 2801 K002 23.12. 14:02|STRASSENBAHN GRAZ\\GRAZ\8010;28.12.2022;23.12.2022;-2,70;EUR',
]
)
df = read_from_csv_single(tmpdir / 'file.csv')
df = clean_data(df)
df = categorize(df)
assert df['category'].iloc[0] == 'living'
assert df['category'].iloc[1] == 'card-unknown'
Finally, rip out
write_to_csv()
⟶ null length script
Safety Net In Place ⟶ Refactoring Categorization¶
Looking at stuff/category.py
…
Still has those string return values
There will be more
Categories? ⟶ enum — Support for enumerations
Refactoring (continuously testing against safety net)
Pull out inner function
Separation into
DataFrame
concerns and simple string routines⟶ ease of testing? A test still has to formulate an entire CSV for nothing.
Introduce Enum
First class style
When done, switch to functional
Create finer grained tests, with only
info
style strings as input
from enum import Enum
def categorize(df):
df['category'] = df['info'].apply(lambda cat: str_category(category(cat)))
return df
Category = Enum('Category', ('Goingout', 'Living', 'Sport', 'Car', 'CardUnknown', 'Unknown'))
def str_category(cat):
match cat:
case Category.Goingout: return "goingout"
case Category.Living: return "living"
case Category.Sport: return "sport"
case Category.Car: return "Car"
case Category.CardUnknown: return "card-unknown"
case Category.Unknown: return "unknown"
case _: assert False
def category(info):
if info.startswith('Bezahlung Karte'):
which, terminal, organization = info.split('|')
if organization.startswith('ORPHEUM BAR'):
return Category.Goingout
if organization.startswith('BILLA DANKT'):
return Category.Living
if organization.startswith('SPAR DANKT'):
return Category.Living
if organization.startswith('HERVIS'):
return Category.Sport
if organization.startswith('SHELL'):
return Category.Car
if organization.startswith('FLUGHAFEN WIEN'):
return Category.Car
return Category.CardUnknown
else:
return Category.Unknown
from stuff.category import categorize, category, Category
from stuff.input import read_from_csv_single
from stuff.conversions import clean_data
from .testutils import write_csv
def test_category_csv(tmpdir):
write_csv(
tmpdir / 'file.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009168|POS 2800 K002 27.12. 09:33|SPAR DANKT 5362\\GRAZ\8020;28.12.2022;27.12.2022;-8,06;EUR',
r'AT666666666666666666;Bezahlung Karte MC/000009169|POS 2801 K002 23.12. 14:02|STRASSENBAHN GRAZ\\GRAZ\8010;28.12.2022;23.12.2022;-2,70;EUR',
]
)
df = read_from_csv_single(tmpdir / 'file.csv')
df = clean_data(df)
df = categorize(df)
assert df['category'].iloc[0] == 'living'
assert df['category'].iloc[1] == 'card-unknown'
def test_more_of_it():
assert Category.Car == category(r'Bezahlung Karte MC/000009153|POS 2802 K002 19.12. 02:00|FLUGHAFEN WIEN PARKEN\\WIEN-FL')
Refactoring Input Reading (⟶ Overengineering)¶
Goal: abstract base class:
InputReader
OO-rewrite input functions from
stuff/input.py
Use in main program: accepts multiple inputs (again), use
RecursiveCSVInputReader
on dir,SingleCSVInputReader
on file⟶
CompositeInputReader
import re
import os
from abc import ABC, abstractmethod
import pandas
class InputReader(ABC):
@abstractmethod
def read(self):
assert False, 'abstract'
class SingleCSVInputReader(InputReader):
def __init__(self, filename):
self.filename = filename
def read(self):
return pandas.read_csv(
self.filename,
sep=';', encoding='iso-8859-1',
names=('account', 'info', 'time_booked', 'time_valuta', 'amount', 'unit'))
class MultipleCSVInputReader(InputReader):
def __init__(self, filenames):
self.readers = [SingleCSVInputReader(fn) for fn in filenames]
def read(self):
ret = pandas.DataFrame()
for rdr in self.readers:
ret = pandas.concat([ret, rdr.read()])
return ret
class RecursiveCSVInputReader(InputReader):
def __init__(self, dirname):
self.dirname = dirname
def read(self):
re_yyyy_mm = re.compile(r'^\d\d\d\d-\d\d\.csv')
names = []
for name in os.listdir(self.dirname):
if re_yyyy_mm.search(name) is not None:
names.append(os.path.join(self.dirname, name))
return MultipleCSVInputReader(names).read()
class CompositeInputReader(InputReader):
def __init__(self, readers):
self.readers = readers
def read(self):
df = pandas.DataFrame()
for rdr in self.readers:
df = pandas.concat([df, rdr.read()])
return df
from .testutils import write_csv
from stuff.input_abc import SingleCSVInputReader, MultipleCSVInputReader, RecursiveCSVInputReader
import pytest
def test_read_csv_single(tmpdir):
write_csv(
tmpdir / '2023-02.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
])
rdr = SingleCSVInputReader(tmpdir / '2023-02.csv')
df = rdr.read()
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
def test_read_csv_multiple(tmpdir):
write_csv(
tmpdir / '2023-02.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
])
write_csv(
tmpdir / '2023-01.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010;31.01.2023;30.01.2023;-15,75;EUR',
r'AT666666666666666666;120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse;31.01.2023;31.01.2023;-12,43;EUR',
])
rdr = MultipleCSVInputReader([tmpdir / '2023-02.csv', tmpdir / '2023-01.csv'])
df = rdr.read()
assert len(df) == 4
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
assert df['info'].iloc[2] == r'Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010'
assert df['info'].iloc[3] == r'120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse'
def test_read_csv_dir(tmpdir):
write_csv(
tmpdir / '2023-02.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST;01.03.2023;28.02.2023;-7,40;EUR',
r'AT666666666666666666;95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH;28.02.2023;28.02.2023;-29,90;EUR',
])
write_csv(
tmpdir / 'garbage.csv',
[
r'AT666666666666666666;Bezahlung Karte MC/000009229|POS 4111 K002 30.01. 19:23|BP TANKSTELLE\\GRAZ\8010;31.01.2023;30.01.2023;-15,75;EUR',
r'AT666666666666666666;120016487112 BG/000009228|BAWAATWWXXX AT081400086210003454|Österreichische Gesundheitskasse;31.01.2023;31.01.2023;-12,43;EUR',
])
rdr = RecursiveCSVInputReader(tmpdir)
df = rdr.read()
assert len(df) == 2
assert df['info'].iloc[0] == r'Bezahlung Karte MC/000009284|ORPHEUM BAR 2371 K002 28.02. 21:14|ORPHEUM BAR\\GRAZ\8020 ST'
assert df['info'].iloc[1] == r'95111243 FE/000009283|AT613400000005077508 World4You Internet Services GmbH'
from stuff.conversions import clean_data
from stuff.input_abc import SingleCSVInputReader, RecursiveCSVInputReader, CompositeInputReader
from stuff.output import write_to_csv
from stuff.category import categorize
import sys
import os
INPUTS = sys.argv[1:-1]
OUTPUTFILE = sys.argv[-1]
readers = []
for i in INPUTS:
if not os.path.exists(i):
assert False
if os.path.isfile(i):
readers.append(SingleCSVInputReader(i))
elif os.path.isdir(i):
readers.append(RecursiveCSVInputReader(i))
else:
assert False
reader = CompositeInputReader(readers)
data = reader.read()
data = clean_data(data)
data = categorize(data)
write_to_csv(data, OUTPUTFILE)