-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRBC_Statement_to_CSV.py
executable file
·220 lines (211 loc) · 8.79 KB
/
RBC_Statement_to_CSV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import re
import os
import pandas as pd
from pdfminer.high_level import extract_text
def descval(text):
"""from text to list of dollar values and the preceding text (their description)"""
amount = re.compile(r"-?\$\d*,?\d+\.\d{2}")
split = amount.findall(text)
temp = text
dvplist = []
for cost in split:
pair = temp.split(cost, 1)
dvplist.append( (pair[0], cost) )
temp = pair[1]
return dvplist
def condmod(item):
"""looks at a ("description", value) and uses conditionals to return better description"""
"""unfortunately I haven't found a way to make this part neat and intuitive.
Transactions usually have an ID, so I filtered them based on that.
the only exceptions that I've found to this are the inlurance premium and it's tax.
Then there is the metadata that comes with each statement.
in the case of the final and initial balance it's just a straightforward matching.
for the minimum balance there always seems to be a null string associate with an instance of it.
for the credit limit and remaining balance it seems to suffices to look for an associated keyword in the preceding text.
For now I've cut some corners by letting
Availablecredit, Payments&credits, Purchases&debits, Cashadvances, Interest, Fees
be caught by a len < 20 rule. it's arbitrary and error prone so this is where i need to look if something funki is going on.
FOR EXAMPLE CAN CATCH A NONSENSE DESCRIPTION THAT IS <20 LENGTH
or can dismiss an appropriate description that is longer than 20 characters
the latter is however caught by the acctDict function, where
if a long description item has a unique cost associated with it we get a runtime error.
so really I just need to be paranoid about the first case
"""
# Transactions
idmatch = re.search(r"\d{23}", item[0]) # assuming (23 digits = id) and (id iff a transaction)
if idmatch: # transactions first because most common
id = idmatch.group(0)
impure_desc = item[0].split(id)[0]
if len(impure_desc) > 50:
cleaner_desc = impure_desc.split("DATEDATE")[1] # if no datedate, then error
return [cleaner_desc, item[1], True]
return [impure_desc, item[1], True]
# ALL OF THE BELOW TRANSACTIONS CAN BE CRINGE IF AT THE BEGINNING OF THE PAGE.
elif item[0][10:] == "BALANCEPROTECTORPREMIUM" or item[0][10:] == "PROVINCIALTAX" or item[0].find("ANNUALFEE") >= 0:
return [item[0], item[1], True]
elif item[0].find("FIRSTREPORT") >= 0:
return [item[0][0:21], item[1], True]
elif item[0].find("CASHADVANCEINTEREST") >= 0 or item[0].find("PURCHASEINTEREST") >= 0:
return [item[0], item[1], True]
# HENCE THIS. i wish i could avoid this nicely.
elif item[0].endswith("BALANCEPROTECTORPREMIUM"):
n = item[0].find("BALANCEPROTECTORPREMIUM")
return [item[0][n-10:], item[1], True]
# ClosingAccountBalance
elif item[0] == "NEWBALANCE" or item[0] == "ClosingAccountBalance" or item[0] == "CREDITBALANCE":
return ["ClosingBalance", item[1], False]
# PreviousAccountBalance
elif item[0] == "PREVIOUSACCOUNTBALANCE" or item[0] == "PREVIOUSSTATEMENTBALANCE":
return ["PreviousBalance", item[1], False]
# MinimumPayment
elif item[0] == "":
return ["MinimumPayment", item[1], False]
# Availablecredit, Payments&credits, Purchases&debits, Cashadvances, Interest, Fees
elif len(item[0]) < 21:
return [item[0], item[1], False]
# CreditLimit
elif item[0].find("Creditlimit") >= 0:
return ["CreditLimit", item[1], False]
elif item[0].find("Remaining") >= 0:
return ["RemainingBalance", item[1], False]
else:
return [item[0], item[1], False]
def cadtonum(dval):
"""takes a string representing a CAD value and turns it into a float"""
intval = dval.partition('$')
vallst = intval[2].split(',') # 2 entry is number string.
valstr = ""
for g in vallst:
valstr += g
if intval[0]=='':
dval = float(valstr)
elif intval[0]=='-':
dval = -1*float(valstr)
else:
print("ERROR: unexpected input")
return dval
def PDFtoLists(file):
"""Just does all the stuff, to one statement, and returns the important content"""
# first just checking file type
# if os.path.isfile(file):
# print('processing: ', file)
# else:
# raise RuntimeError('not a file')
# if not broken, do stuff
text = extract_text(file) # pdf content into raw string
dv = descval(text)
temp = [] # somewhat refining the dv data
for i in dv:
temp.append(condmod(i))
transactions = []
accountdata = []
for t in temp:
if t[2]:
transactions.append([t[0][10:], cadtonum(t[1]), t[0][0:5], t[0][5:10]])
else:
accountdata.append([t[0], cadtonum(t[1])])
return (transactions, accountdata)
def acctDict(acct):
"""Turns non-transaction entries into a dict of acct infos, after checking the "redundant" entries are clear"""
dux = []
leg = []
for x in acct:
if len(x[0]) > 20:
dux.append(x)
else:
leg.append(x)
# print(len(leg)) # would be nice to make leg a dictionary
for r in dux:
isRedundant = False
for h in leg:
if r[1] == h[1]:
isRedundant = True
# isRedundant = False
if not isRedundant:
if r[0] == "SUBTOTALOFMONTHLYACTIVITY": # only appears when got a new CC.
continue # useless info, except maybe to know CC changed
raise RuntimeError(f"the item:\n{r}\n needs a label")
# if here, no errors were found. so all dux are redundant and thrown out
acctSummary = {} # by making it a dict, repeat values are eliminated
for pair in leg:
acctSummary[pair[0]] = pair[1]
return acctSummary
def reprParsed(trs, vals):
print("TRANSACTIONS")
for i in trs:
print(i)
print("ACCOUNT SUMMARY")
for i in vals:
print(i, vals[i])
def check(acct, trns):
# Sum up transactions according to Summary Categories provided
Payments = 0
Purchases = 0
# CashAdvances = 0
# Interest = 0
# Fees = 0
for trn in trns:
# print(trn)
# if trn[0].find("INTEREST") >= 0:
# Interest += trn[1]
# print(f"{trn} is an interest")
# elif trn[0].find("SERVICECHARGE") >= 0 or trn[0].endswith("ANNUALFEE"):
# Fees += trn[1]
# print(f"{trn} is a Fee")
# elif trn[0].find("ATMCASHADV") >= 0:
# CashAdvances += trn[1]
# print(f"{trn} is a Cash Advance")
if trn[1] > 0:
Purchases += trn[1]
else:
Payments -= trn[1]
# for i in acct:
# print(i, acct[i])
# Check sums match summary data
# if abs(Purchases - acct["Purchases&debits"]) > 0.005:
# print(Purchases)
# print(acct["Purchases&debits"])
# raise RuntimeError("Potentially missed a purchase transaction")
# elif abs(Payments + acct["Payments&credits"]) > 0.005:
# raise RuntimeError("Potentially missed a payment transaction")
# elif abs(CashAdvances - acct["Cashadvances"]) > 0.005:
# raise RuntimeError("Potentially missed a Cash Advance transaction")
# elif abs(Interest - acct["Interest"]) > 0.005:
# raise RuntimeError("Potentially missed an Interest Fee")
# elif abs(Fees - acct["Fees"]) > 0.005:
# raise RuntimeError("Potentially missed a Fee")
# else:
delta = acct["ClosingBalance"] - acct["PreviousBalance"]
allGood = True if abs(delta - (Purchases - Payments)) < 0.005 else False
if allGood:
print("All transactions have been accounted for")
else:
raise RuntimeError("Potentially missed a transaction")
def main(file):
print(file)
transactions, accountdata = PDFtoLists(file)
acctVals = acctDict(accountdata)
check(acctVals, transactions)
# if any mistakes have happened, the above will throw a runtime error
trdf = pd.DataFrame(transactions)
acctdf = pd.Series(acctVals, name='amount')
# print(len(trdf), len(acctdf))
date = re.search(r"\d{4}-\d{2}-\d{2}", file).group(0)
trdf.to_csv(f"../CC_Transaction_CSVs/CC_{date}.csv")
acctdf.to_csv(f"../CC_MetaData_CSVs/CCdata_{date}.csv")
# reprParsed(transactions, acctVals)
return (trdf, acctdf)
# sfile = # put particular statement here
# main(sfile)
# transactions, accountdata = PDFtoLists(sfile)
# print("transasctions")
# for tr in transactions:
# print(tr)
# print("account data")
# for da in accountdata:
# print(da)
# print(len(accountdata))
# actd = acctDict(accountdata)
# for i in actd:
# print(i, actd[i])
# print(f"length of the dict = {len(actd)}")