-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyzer.py
executable file
·524 lines (437 loc) · 18.3 KB
/
analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
#!/usr/bin/env python3
import os
from os.path import join, split, isdir
import glob
import time
from collections import Counter
from pprint import pprint
# TODO enable these w/ max size either >= all replays or >= all within some
# timeframe, like a season
# TODO TODO but check how much disk space these choices will cost me!
#os.environ['SC2READER_CACHE_DIR'] = "path/to/local/cache"
#os.environ['SC2READER_CACHE_MAX_SIZE'] = 100
import sc2reader
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
# File extension (without the leading dot) used to build the glob pattern that
# finds replay files.
REPLAY_EXTENSION = 'SC2Replay'
# Result of the first successful `replay_dir()` call, so the filesystem is only
# searched once per process.
_replay_dir = None


def replay_dir():
    """Returns the path of the multiplayer replay folder for the one account.

    Looks under `~/Documents/StarCraft II/Accounts`, which must contain exactly
    one account directory. That directory must in turn contain exactly one
    subdirectory other than `Hotkeys`, and the returned folder is
    `<that subdirectory>/Replays/Multiplayer`. The answer is cached in the
    module-level `_replay_dir`, so later calls return it without touching the
    filesystem.

    Raises:
        AssertionError: if the directory layout does not match the above.
    """
    global _replay_dir
    if _replay_dir is None:
        accounts_root = os.path.expanduser('~/Documents/StarCraft II/Accounts')

        # TODO are the two folder names i'm searching for meaningful?
        # should i save them as other variables?
        # (maybe for auto assigning the player, if other people wanna use this
        # and thus my hardcoded `my_name='TheTossBoss'` wouldn't help them...)

        # (no attempt to support multiple accounts now)
        found_accounts = glob.glob(join(accounts_root, '*/'))
        assert len(found_accounts) == 1
        only_account = found_accounts[0]

        # not sure how multiple subdirs here differ from above...
        # might be a "profile" / server thing
        profile_dirs = []
        for candidate in glob.glob(join(only_account, '*/')):
            # The glob results end with a separator, so dirname() just strips
            # it, leaving the directory's own path to test.
            if os.path.dirname(candidate).endswith('Hotkeys'):
                continue
            profile_dirs.append(candidate)
        assert len(profile_dirs) == 1

        # This seems equal to my player.toon_handle parseable at <= load_level=2
        # w/ sc2reader (though still not sure what that means)
        only_profile = profile_dirs[0]

        multiplayer_dir = join(only_profile, 'Replays', 'Multiplayer')
        assert isdir(multiplayer_dir)
        _replay_dir = multiplayer_dir

    return _replay_dir
def list_replays(under=None, recursive=True):
    """Lists replay files, by default everything in the `replay_dir()` folder.

    Args:
        under: folder to search. If None, `replay_dir()` is used.
        recursive: if True, subfolders of the search folder are included.

    Returns:
        List of paths whose names end in `.{REPLAY_EXTENSION}`, as returned by
        `glob.glob`.
    """
    search_root = replay_dir() if under is None else under

    pattern = f'*.{REPLAY_EXTENSION}'
    if recursive:
        pattern = f'**/{pattern}'

    # TODO does passing recursive=True here cause the '**/...' glob expression
    # to behave the same in python <3.5 and python >=3.5? (was intention)
    return glob.glob(join(search_root, pattern), recursive=recursive)
# Maps the integer stored in a player's `highest_league` attribute to a
# readable league name (used for the `opponent_highest_league` column).
# At a minimum, this is probably not going to be correct going back to games
# where there were different leagues available.
highest_league_num2str = {
    1: 'bronze',
    2: 'silver',
    3: 'gold',
    4: 'platinum',
    5: 'diamond',
    6: 'master',
    7: 'grandmaster',
    8: 'no_ranking_this_season',
    # TODO what is 0? check some bnet profiles to see?
    0: 'not_sure'
}
# Lowercased unit titles that `real_army_units` never counts.
free_units = {
    'broodling',
    'locust',
    'interceptor',
    'autoturret',
    'infestedterran',
}

# TODO am i missing archon merges somehow?
# TODO need to deal w/ neural parasite changing ownership?
# TODO TODO TODO do the .units lists include events about the units dying and
# stuff (basically, are there any circumstances where units are double counted?)
# eliminate all double counting! (what about the units moving? that counted?)
# TODO TODO see sc2reader docs or make my own test cases, checking that the
# total number of units *created* is at least correct

# Every title that has passed the `real_army_units` filter so far, across all
# calls (handy for checking what the filter lets through).
_real_army_units_seen = set()


def real_army_units(units):
    """Returns the units in `units` that should count towards army composition.

    A unit is kept when it is flagged `is_army`, is not `hallucinated`, and its
    `title` is not one of the excluded titles / suffixes below and is not (case
    insensitively) in `free_units`. The titles of kept units are also recorded
    in the module-level `_real_army_units_seen`.

    Args:
        units: iterable of objects with `is_army`, `hallucinated` and `title`
            attributes.

    Returns:
        List of the kept units, in their original order.
    """
    # Apart from 'Overlord', these look like alternate forms of other units
    # (presumably excluded to avoid double counting - see TODOs above).
    excluded_titles = (
        'Overlord',
        'SiegeTankSieged',
        'WarpPrismPhasing',
        'VikingAssault',
        'BattleHellion',
    )
    excluded_suffixes = ('Burrowed', 'Cocoon')

    kept = []
    for unit in units:
        # `hallucinated` is only consulted for army units, as before.
        if not unit.is_army or unit.hallucinated:
            continue

        title = unit.title
        if title in excluded_titles:
            continue
        if title.lower() in free_units:
            continue
        if title.endswith(excluded_suffixes):
            continue

        kept.append(unit)

    _real_army_units_seen.update(unit.title for unit in kept)
    return kept
# Maps lowercased unit titles to the shorter names used in summaries. Titles
# without an entry are reported lowercased but otherwise unchanged.
unit2abbreviation = {
    'battlehellion': 'hellbat',
    # TODO test these keys are all correct and not missing spaces or something
    'darktemplar': 'DT',
    'zergling': 'ling',
    'mutalisk': 'muta',
    'immortal': 'immo',
    'warpprism': 'prism',
}


def abbreviate_unit(unit):
    """Returns the short name for the unit title `unit`.

    The lookup is case insensitive. A title with no entry in
    `unit2abbreviation` is returned lowercased.
    """
    unit = unit.lower()
    return unit2abbreviation.get(unit, unit)


# TODO maybe count just what exists when the game ends?
# (might help to quickly see what they beat you with, unless units sitting
# at home add noise...)
def count_army_units(army_units):
    """Counts units by abbreviated title.

    Args:
        army_units: sequence of objects with a `title` attribute.

    Returns:
        A 2-tuple of:
        - `Counter` mapping abbreviated title -> number of units
        - dict mapping abbreviated title -> the first unit in `army_units`
          with that abbreviation (used to look up per-type attributes)
    """
    # Throwing away the time information, etc.
    army_unit_abbrevs = [abbreviate_unit(u.title) for u in army_units]

    # Walking backwards means the earliest unit of each type is written last,
    # and so is the one that ends up stored.
    abbrev2representative_unit = dict()
    for abbrev, unit in zip(army_unit_abbrevs[::-1], army_units[::-1]):
        abbrev2representative_unit[abbrev] = unit

    return Counter(army_unit_abbrevs), abbrev2representative_unit


def most_common_army_unit(army_units):
    """Returns the abbreviated title of the most numerous unit type.

    Returns None if `army_units` is empty.
    """
    if len(army_units) == 0:
        return None

    counter, _ = count_army_units(army_units)
    return counter.most_common(1)[0][0]


# TODO maybe include upgrades in something like this / in another str
def army_summary_str(army_units, scale_by='supply', n=3):
    """Summarizes army composition, e.g. `'60% stalker, 30% immo (10% other)'`.

    Args:
        army_units: sequence of objects with a `title` attribute (and a
            `supply` attribute, if `scale_by='supply'`).
        scale_by: how each unit type is weighted. `'number'` uses the unit
            count; `'supply'` uses count times the `supply` of a representative
            unit of that type.
        n: maximum number of unit types to name (None for all of them).

    Returns:
        None if `army_units` is empty. Otherwise a string naming the `n` unit
        types with the largest share under the chosen weighting, largest
        first, followed by the share of everything else if that is over 3%.
        If every weight is zero (e.g. all units have zero supply), shares fall
        back to plain unit counts rather than dividing by zero.

    Raises:
        NotImplementedError: if `scale_by` is not `'number'` or `'supply'`.
    """
    if len(army_units) == 0:
        return None

    counter, abbrev2representative_unit = count_army_units(army_units)

    # Starting from count order means the (stable) sort by share below breaks
    # ties in favor of the more numerous type.
    by_count = counter.most_common()
    unit_names = [name for name, _ in by_count]
    counts = np.array([count for _, count in by_count])

    if scale_by == 'number':
        numerators = counts

    elif scale_by == 'supply':
        supplies = np.array([
            abbrev2representative_unit[a].supply for a in unit_names
        ])
        numerators = counts * supplies

    # TODO maybe also consider weighting by resource costs?
    else:
        raise NotImplementedError(f'scale_by={scale_by} not supported')

    total = np.sum(numerators)
    if total == 0:
        # Nothing to weight by; plain counts are the only meaningful shares.
        numerators = counts
        total = np.sum(numerators)

    fractions = numerators / total

    # Rank by the share that is actually printed. Ranking by raw count (as
    # before) could leave the dominant type out of the summary when weighting
    # by supply, since cheap units are numerous but contribute little.
    ranked = sorted(
        range(len(unit_names)), key=lambda i: fractions[i], reverse=True
    )
    # TODO maybe some elbow finding method to pick n highest to use?
    top = ranked if n is None else ranked[:max(n, 0)]

    summary_str_parts = []
    remaining = 1.0
    for i in top:
        p = fractions[i]
        remaining -= p
        summary_str_parts.append(f'{p:.0%} {unit_names[i]}')

    summary_str = ', '.join(summary_str_parts)

    if remaining > 0.03:
        summary_str += f' ({remaining:.0%} other)'

    return summary_str
def _append_getter_values(var2value_lists, source, vars2getters, prefix=''):
    """Appends one value per variable in `vars2getters`, read from `source`.

    A getter of None means the variable is read as the attribute of the same
    name on `source`; otherwise the getter is called with `source`. Values are
    appended to `var2value_lists[prefix + var]`.
    """
    for var, getter in vars2getters.items():
        value = getattr(source, var) if getter is None else getter(source)
        var2value_lists[prefix + var].append(value)


# TODO TODO how to restrict all replays to just 1v1 ranked (or just 1v1 ranked /
# unranked, if that's much easier)?
def load_1v1_ranked(paths, my_name, analyze_units=True):
    """Loads 1v1 ladder replays into a `DataFrame` with one row per game.

    Replays that include computer players, are not ladder games, or do not
    have exactly two players are skipped.

    Args:
        paths: replay file paths, passed to `sc2reader.load_replays`.
        my_name: in-game name of the player whose perspective defines the
            `my_*` / `opponent_*` columns and the `won` column.
        analyze_units: if True, replays are loaded at `load_level=3` (rather
            than 2) and the `*_mode_army_unit` / `*_army_summary` columns are
            added.

    Returns:
        `DataFrame` indexed by (and sorted on) `start_time`. Of any replays
        sharing a `start_time`, only the first is kept. `won` is None for a
        replay with no recorded winner.

    Raises:
        ValueError: if neither player in a replay is named `my_name`.
        AssertionError: if a replay violates one of the sanity checks below.
    """
    # TODO maybe just use load_players or something else to just get the data i
    # want, rather than load_level=2, which also loads other stuff
    # (though idk if any of other stuff totals to significant time loading...)

    # Technically possible to load all at load_level=1 first and check
    # .type == '1v1', but 1) most of my games are 1v1 anyway and
    # 2) loading all at load_level=1 or load_level=2 seems to take about the
    # same amount of time, so unlikely that any such strategy would help much,
    # as far as total load time is concerned.
    if analyze_units:
        load_level = 3
    else:
        load_level = 2

    before = time.time()
    print(f'Loading {len(paths)} replays at load_level={load_level} ...')
    replays = sc2reader.load_replays(paths, load_level=load_level)

    # If True, replays failing a filter below are skipped as soon as they fail.
    shortcircuit = True
    n_with_computers = 0
    n_ladder = 0
    n_1v1 = 0
    n_competitive = 0

    def won(r):
        # The list of players indexed here is the winning team, which should
        # always just have one person given the restrictions earlier in this
        # function.
        # NOTE(review): assumes sc2reader leaves `winner` as None when the
        # result is unknown - confirm. Reporting None beats aborting the whole
        # load on one such replay.
        if r.winner is None:
            return None
        # Compare names: a player object is never equal to the `my_name` str,
        # so comparing the object itself marked every game as a loss.
        # TODO elsewhere assert name is unique in 1v1
        return r.winner.players[0].name == my_name

    # If the value for a key is `None` here, the variable is assumed to be
    # accessible as an attribute of the replay object.
    game_vars2getters = {
        'filename': None,
        'start_time': None,
        # TODO matter whether i use this .real_length over just .length?
        'duration_s': lambda r: r.real_length.total_seconds(),
        'map_name': None,
        'won': won,
        'expansion': None,
        # TODO could also get maybe build / base_build, though not sure if they
        # map to the balance patches i might care about or how to do that
        # mapping
    }
    # TODO TODO get variable representing the season

    my_prefix = 'my_'
    opponent_prefix = 'opponent_'

    # If the value for a key is `None` here, the variable is assumed to be
    # accessible as an attribute of the player (`sc2reader.objects.Participant`)
    # object.
    my_vars2getters = {
        'pick_race': None,
        'play_race': None,
        # this is just to see if variation in this for my opponents is maybe
        # just b/c the mapping changed and thus my subregion also changed
        'subregion': None,
        # MMR before match. Not sure if "scaled" actually has any meaning here.
        'scaled_rating': lambda p: p.init_data['scaled_rating'],
    }
    opponent_vars2getters = {
        'pick_race': None,
        'play_race': None,
        'name': None,
        'clan_tag': None,
        # TODO get which sub league they are in? that vary, or do i always play
        # people within mine?
        # TODO does subregion actually vary for my opponents? drop if not.
        'subregion': None,
        # TODO are other IDs (like battle tag) available directly from replay?
        # Can be used to generate URL to their BNet profile
        # (directly accessible w/ player.url)
        'bnet_uid': lambda o: o.detail_data['bnet']['uid'],
        'highest_league': lambda p: highest_league_num2str[p.highest_league],
        'scaled_rating': lambda p: p.init_data['scaled_rating'],
        # Chat will be handled separately.
    }

    # Has one key for each in the three dicts above (though player variables
    # will be prefixed), and the values for each should be same-length lists
    # of length equal to the number of replays analyzed.
    var2value_lists = {v: [] for v in (
        list(game_vars2getters.keys()) +
        [my_prefix + k for k in my_vars2getters.keys()] +
        [opponent_prefix + k for k in opponent_vars2getters.keys()] +
        [p + 'chat' for p in (my_prefix, opponent_prefix)]
    )}
    if analyze_units:
        var2value_lists[my_prefix + 'mode_army_unit'] = []
        var2value_lists[opponent_prefix + 'mode_army_unit'] = []
        var2value_lists[my_prefix + 'army_summary'] = []
        var2value_lists[opponent_prefix + 'army_summary'] = []

    # TODO need to separately filter out custom 1v1 games against humans,
    # or do is_ladder / competitive already effectively filter that?
    # (look at whats get filtered and check for my recent games w/ brian for
    # instance...)
    for replay in tqdm(replays, total=len(paths)):
        if len(replay.computers) > 0:
            n_with_computers += 1
            if shortcircuit:
                continue

        # TODO seems ladder might already only select 1v1s?
        if not replay.is_ladder:
            if shortcircuit:
                continue
        else:
            assert replay.competitive
            n_ladder += 1

        # TODO players include AI? (if not, this might catch non-1v1 games...)
        if len(replay.players) != 2:
            if shortcircuit:
                continue
        else:
            assert replay.type == '1v1'
            n_1v1 += 1

        # TODO i have definitely played *some* 1v1 unranked games, so it seems
        # this must be True in those cases as well...
        assert replay.competitive
        n_competitive += 1

        # TODO why is this always None? what is this for?
        # TODO check it when fully loaded
        assert replay.ranked is None

        assert replay.players[0].name != replay.players[1].name
        if replay.players[0].name == my_name:
            my_idx = 0
            opponent_idx = 1
        elif replay.players[1].name == my_name:
            my_idx = 1
            opponent_idx = 0
        else:
            # (was missing the f prefix, so the name was never interpolated)
            raise ValueError(
                f'no player with name matching my_name={my_name}'
            )

        me = replay.players[my_idx]
        opponent = replay.players[opponent_idx]

        _append_getter_values(var2value_lists, replay, game_vars2getters)
        _append_getter_values(
            var2value_lists, me, my_vars2getters, prefix=my_prefix
        )
        _append_getter_values(
            var2value_lists, opponent, opponent_vars2getters,
            prefix=opponent_prefix
        )

        # Handling chat separately cause it's not most easily accessible as a fn
        # of the player objects it seems.
        my_chat = []
        their_chat = []
        for m in replay.messages:
            if not m.to_all:
                continue

            # might want to also store m.frame to re-order later
            # (if i want that)
            if m.player.name == my_name:
                my_chat.append(m.text)
            else:
                assert m.player.name == opponent.name
                their_chat.append(m.text)

        var2value_lists[my_prefix + 'chat'].append(my_chat)
        var2value_lists[opponent_prefix + 'chat'].append(their_chat)

        if analyze_units:
            opponent_army_units = real_army_units(opponent.units)
            var2value_lists[opponent_prefix + 'mode_army_unit'].append(
                most_common_army_unit(opponent_army_units)
            )
            var2value_lists[opponent_prefix + 'army_summary'].append(
                army_summary_str(opponent_army_units)
            )

            my_army_units = real_army_units(me.units)
            var2value_lists[my_prefix + 'mode_army_unit'].append(
                most_common_army_unit(my_army_units)
            )
            var2value_lists[my_prefix + 'army_summary'].append(
                army_summary_str(my_army_units)
            )

    # Uncomment to see how many replays reached each filter:
    #print('n_1v1:', n_1v1)
    #print('n_with_computers:', n_with_computers)
    #print('n_ladder:', n_ladder)
    #print('n_competitive:', n_competitive)

    # TODO why does even load_level=4 seem to not correctly specify
    # replay.ranked? need to do some other init? is that the flag i want?
    # TODO how does .ranked differ from .competitive?

    # TODO compare what i can get out of 3 vs 4.
    #for path in tqdm(paths_to_fully_load, total=len(paths_to_fully_load)):
    #    r3 = sc2reader.load_replay(path, load_level=3)
    #    r4 = sc2reader.load_replay(path, load_level=4)
    #    # player.units something i want here?
    #    import ipdb; ipdb.set_trace()

    # TODO TODO would it take too much memory to store full load_level 3/4
    # replay objects to each? store some more compact repr of build orders
    # in a column too?
    #load_level = 4
    #print(f'Loading {len(paths)} replays at load_level={load_level} ...')
    #l4_replays = sc2reader.load_replays(paths, load_level=load_level)

    # TODO TODO maybe derive columns for whether i used hotkeys for various
    # things (particularly (all/most of) my bases)

    total_s = time.time() - before
    print(f'Loading took {total_s:.0f}s')

    df = pd.DataFrame(var2value_lists)

    n_replays_before = len(df)
    df.drop_duplicates(subset=['start_time'], inplace=True)
    n_dropped = n_replays_before - len(df)
    if n_dropped > 0:
        print(f'Dropped {n_dropped} replays with duplicate start_time')

    df.set_index('start_time', inplace=True)
    df.sort_index(inplace=True)
    return df
def game_fraction_familiar_opponent(df):
    """
    Plots, and returns, the running fraction of games that were played against
    an opponent (by `opponent_bnet_uid`) already seen in an earlier row.

    Assumes `df` rows are already sorted by `start_time`, with older games
    in earlier rows.
    """
    # True for every game except the first against each opponent.
    is_rematch = df['opponent_bnet_uid'].duplicated(keep='first')

    games_so_far = np.arange(1, len(df) + 1)
    frac_familiar_series = is_rematch.cumsum() / games_so_far

    _, ax = plt.subplots()
    ax.plot(frac_familiar_series)
    ax.set_title('Fraction of games against familiar opponents')
    plt.show()

    # TODO varying lookbehind windows? (i initially described a lookbehind
    # window that extends back to time 0=first replay, but maybe try some that
    # are like weeks / months long?)
    return frac_familiar_series
# TODO TODO TODO try to group / cluster builds into a set of existing ones (or
# learn the clusters from the data) (but maybe try clustering into some popular
# spawningtool builds for instance?) (maybe just start w/ a variable that is the
# most common non-worker unit they built?)
# TODO TODO TODO maybe come up w/ some statistic representing how aggressive a
# certain player is in a certain game? (maybe compute something like how many
# minutes until first army unit sent across / attacking structure built?) or
# some statistic that tries to compute time to first peak in opponent resources
# lost (though defending would also increase this...)? when they build first
# expansion (if they do) / early tech structures that might indicate aggression?
# TODO TODO see whether any of the statistics in the spawningtool code seem
# useful for my analysis
def main():
    """Loads every replay, plots opponent familiarity, then opens a debugger.

    The trailing `ipdb` breakpoint leaves `paths`, `my_name`, `df` and
    `frac_familiar` available for interactive inspection.
    """
    my_name = 'TheTossBoss'
    paths = list_replays()

    # Scratch code for comparing what each load_level provides:
    #path = paths[0]
    ## load_level=4 is the default (loads everything except map, which has a
    ## separate flag to load)
    ## load_level=2 is the minimum that has players
    #r0 = sc2reader.load_replay(path, load_level=0)
    #r1 = sc2reader.load_replay(path, load_level=1)
    #r2 = sc2reader.load_replay(path, load_level=2)
    ## r3 also loads "tracker" events, which might not be that useful alone?
    ## (or could be faster way to get a lot of the game state i care about...)
    #r4 = sc2reader.load_replay(path, load_level=4)
    #import ipdb; ipdb.set_trace()

    # TODO TODO TODO cache this df (particularly if it ends up using
    # load_level=4 to get build order info)
    # TODO though see what caching is available in sc2reader, as it seems they
    # have some (see commented env var stuff above sc2reader import)
    df = load_1v1_ranked(paths, my_name)

    frac_familiar = game_fraction_familiar_opponent(df)

    #print('"real" army units seen:')
    #pprint(_real_army_units_seen)

    import ipdb
    ipdb.set_trace()


if __name__ == '__main__':
    main()