-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_spending_data_methods.py
381 lines (327 loc) · 13.8 KB
/
extract_spending_data_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
# extract_spending_data_methods.py
"""Read in a bunch of mint transaction data
Extract it out year by year and "clean" it
by removing income and other transactions
that don't relate to spending
Output a file that just shows spending data
(including "income" that was determined to
be a refund)
"""
import pandas as pd
import sys
# Avoid SettingWithCopyWarning
pd.options.mode.chained_assignment = None # default='warn'
def extract_spending(
mint_df, exclude_spending_group_list, start_after_date, end_before_date
):
"""This method takes a dataframe made up of transactions
that have been exported from mint and returns a dataframe
that includes only the spending related transactions
that occured between the specified start and end dates
It is assumed that prior to calling this method the mint
transaction data has been post-processed to include a
Spending Group category for each transaction. (This can
be done by calling the group_categories method).
After extraction all the transactions for the specified
date range this method will remove all transaction associated
with Spending Groups specified in the exclude_spending_group_list
CSV.
Any credit transactions that remain are assumed to be "refunds"
and are converted to negative debit transactions in the returned
dataframe
"""
# Check that the input mint transaction has been postprocessed
# to group transaction categories into Spending Groups
if "Spending Group" not in mint_df.columns:
raise Exception(
"extract_spending: Input mint transaction data does not have a "
"Spending Group column.\n Pass data to group_transactions method "
"before calling this method"
)
# Extract the transactions for the specified date range
new_df = extract_transactions_by_date_range(
mint_df, start_after_date, end_before_date
)
# Remove all the transactions associated with Spending Groups in the exclude list
new_df = remove_excluded_spending_group(new_df, exclude_spending_group_list)
# Iterate over the remaining groups and remove any income, which at this
# point should be considered as a "refund"
print("\n-------- Analyzing Credits by Spending Group ---------\n")
for group in new_df["Spending Group"].unique():
print("Analyzing credits for spending group: " + group + "....")
new_df[(new_df["Spending Group"] == group)] = new_df[
(new_df["Spending Group"] == group)
].apply(find_refunds, axis=1)
# TODO ? If the net of the refunds is income instead of spending
# should I just remove the group?
# After filtering out "income" that was probably refunds,
# see what type of unexpected income is left. This should not happen (I think...)
income = new_df[new_df["Transaction Type"] == "credit"]
if len(income):
print("\nAfter removing refunds the following income is still unexpected:")
print(income)
print("")
return new_df
def extract_income(
mint_df, exclude_spending_group_list, start_after_date, end_before_date
):
"""This method takes a dataframe made up of transactions
that have been exported from mint and returns a dataframe
that includes only the transactions that are associated
with income generation between the specified
start and end dates
It is assumed that prior to calling this method the mint
transaction data has been post-processed to include a
Spending Group category for each transaction. (This can
be done by calling the group_categories method).
After extracting all the transactions for the specified
date range this method will remove all transaction associated
with Spending Groups specified in the exclude_spending_group_list
CSV.
The remaining transactions are evaluated to determine if
they were income generating. Any transaction data associated
with the "Income" Spending Group is assumed to be income.
All other Spending Groups are analyzed to determine if the
aggregate sum of the transactions during the period were positive.
Spending Groups that resulted in more spending than income
during the period are removed from the data set.
"""
# Extract all transactions in the specified date range
new_df = extract_transactions_by_date_range(
mint_df, start_after_date, end_before_date
)
# Remove all the transactions associated with Spending Groups in the exclude list
new_df = remove_excluded_spending_group(new_df, exclude_spending_group_list)
# Iterate over the remaining groups and remove all the transactions
# for any group whose transactions did not generate income in the aggregate
print("\n------Looking for Spending Groups with Income -------")
for group in new_df["Spending Group"].unique():
if group != "Income":
new_df = analyze_and_remove_non_income(new_df, group, output_analysis=True)
print(
"\nFound "
+ str(len(new_df))
+ " Income related transactions between "
+ start_after_date
+ " and "
+ end_before_date
+ "\n\n"
)
return new_df
def extract_payments_and_income(input_df, spending_group, output_analysis):
"""This helper method returns the sum of the credits and debits
for all the transactions associated with a spending group
"""
# Compare the income (credits) and the spending (debits) for the group
spending_group_df = input_df[input_df["Spending Group"] == spending_group]
payments = spending_group_df[
spending_group_df["Transaction Type"] == "debit"
].Amount.sum()
income = spending_group_df[
spending_group_df["Transaction Type"] == "credit"
].Amount.sum()
# For some categories (ie: Credit Card Payments or Expenses)
# the payments and credits should match or be close.
# If not the data might be a little dirty, so we provide some output that
# allows them to review it and update their mint categorization
if output_analysis and (payments > 0 or income > 0):
print("\nAnalyzing spending_group: " + spending_group + "...")
print(
"Found a total of ${:,.2f}".format(payments)
+ " payments for this time period."
)
print(
"Found a total of ${:,.2f}".format(income)
+ " income for this time period."
)
return (payments, income)
def remove_spending_group(input_df, spending_group, output_analysis=True):
"""This method removes the spending_group from the df
It is used primarily for things that show up both as debits and credits
in Mint, for example Credit Card Payments or Reimbursed Expenses
If the output_analysis parameter is set to True
Print out some info about the credit and debit relationship to allow
user to analyze the data if it seems pretty far off
"""
(payments, income) = extract_payments_and_income(
input_df, spending_group, output_analysis
)
output_df = input_df[input_df["Spending Group"] != spending_group]
if payments > 0 or income > 0:
if output_analysis and (payments - income) > 0:
print(
"Loss of ${:,.2f}".format(payments - income)
+ " is not included in the spending analysis"
)
elif output_analysis and (income - payments) > 0:
print(
"Unexpected(?) income of ${:,.2f}".format(income - payments)
+ " was detected"
)
print(
"After removing "
+ spending_group
+ " related transactions we have ${:,.2f}".format(output_df.Amount.sum())
+ " in transactions."
)
else:
print(
"\nAnalyzing spending_group: "
+ spending_group
+ "... No transactions found"
)
return output_df
def analyze_and_remove_non_income(input_df, spending_group, output_analysis=True):
"""This method analyzes a spending group to see if the sum of the transactions
are postive. For groups that did not generate income the set of transactions
is removed from the data set
If the output_analysis parameter is set to True
Print out some info about the credit and debit relationship to allow
user to analyze the data if it seems pretty far off
"""
# Compare the income (credits) and the spending (debits) for the group
(payments, income) = extract_payments_and_income(
input_df, spending_group, output_analysis
)
if (payments - income) >= 0:
print(
"Spending group "
+ spending_group
+ " did not generate income. Removing it from income data set"
)
return input_df[input_df["Spending Group"] != spending_group]
else:
print(
"Spending group "
+ spending_group
+ " generated income. Keeping it in income data set"
)
# Convert any debits into "negative" expenses in the Income generating
# Spending Group
input_df[(input_df["Spending Group"] == spending_group)] = input_df[
(input_df["Spending Group"] == spending_group)
].apply(find_expenses, axis=1)
return input_df
def group_categories(df, spending_group_defs, show_group_details=False):
"""Add a new "Spending Group" column to group categories
The categories that belong to each spending group are defined
in a CSV file that is passed in via the spending_group_defs
"""
# Create a new "Spending Group" column so that we can start
# grouping related expenses into a single bucket
df["Spending Group"] = df.Category
print(
"Reading spending category definitions from spending from", spending_group_defs
)
try:
group_cats_df = pd.read_csv(spending_group_defs)
except BaseException as e:
print(
"Failed to read spending category definitions from",
spending_group_defs,
file=sys.stderr,
)
print("The exception: {}".format(e), file=sys.stderr)
raise e
# Iterate over spending groups.
for group_name, categories in group_cats_df.items():
categories = categories.dropna()
if show_group_details:
print(
"Setting Spending Group: "
+ group_name
+ " to: "
+ str(categories.tolist())
)
# Assign Spending Group to all transactions with categories for that group
df.loc[df.Category.isin(categories.tolist()), "Spending Group"] = group_name
# print('\nAnalyzing refunds for spending group: '+group_name+'....')
# df[grp] = df[grp].apply(find_refunds, axis=1)
return df
def find_refunds(row):
"""Look for transactions that have a
credit for a typical "spending" category
Change these to a debit with a negative
amount.
This results in the refunds getting subtracted
from the total spend calculations
"""
if row["Transaction Type"] == "credit":
print(
" -- Credit on "
+ row.name.strftime("%m/%d/%Y")
+ " from "
+ row.Description
+ " for ${:,.2f}".format(row.Amount)
)
row.Amount *= -1
row["Transaction Type"] = "debit"
return row
def find_expenses(row):
"""Look for transactions that have a
debit for a typical "income" category
Change these to a credit with a negative
amount.
This results in the expenses getting subtracted
from the total income calculations
"""
if row["Transaction Type"] == "debit":
print(
" -- Expense on "
+ row.name.strftime("%m/%d/%Y")
+ " from "
+ row.Description
+ " for ${:,.2f}".format(row.Amount)
)
row.Amount *= -1
row["Transaction Type"] = "credit"
return row
def extract_transactions_by_date_range(input_df, start_after_date, end_before_date):
"""Helper method to extract all transactions in a date range"""
print(
"Finding transaction data for period > "
+ start_after_date
+ " and < "
+ end_before_date
)
new_df = input_df[
(input_df.index < end_before_date) & (input_df.index > start_after_date)
]
print(
"Found a total of ${:,.2f}".format(new_df.Amount.sum())
+ " in transactions for this time period."
)
return new_df
def remove_excluded_spending_group(df, exclude_spending_group_list):
"""Helper method to remove all transactions in a dataframe
associated with Spendig Groups identified in the exclude
spending group list file passed in as a CSV
The first column in the CSV is the name of the Spending groups
to remove
The second "Hide Analysis" column can optionally be set to false
to reduce the amount of output printed about the details of the
spending and income found in the Spending Group
"""
print(
"Reading categories to extract from spending from", exclude_spending_group_list
)
try:
esg_df = pd.read_csv(exclude_spending_group_list)
except BaseException as e:
print(
"Failed to read list of categories to exclude from spending analysis from",
exclude_spending_group_list,
file=sys.stderr,
)
print(
"The exception: {}".format(e), exclude_spending_group_list, file=sys.stderr
)
raise e
# Remove transactions for each category in the exclude_categories list
print("\n------ Removing Specified Spending Groups ---------")
for i, row in esg_df.iterrows():
if row["Hide Analysis"]:
df = remove_spending_group(df, row["Spending Group"], output_analysis=False)
else:
df = remove_spending_group(df, row["Spending Group"])
return df