Skip to content

Commit

Permalink
add the script to find+remove duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
shlomif committed Nov 12, 2024
1 parent a510a55 commit 810a89e
Showing 1 changed file with 54 additions and 0 deletions.
54 changes: 54 additions & 0 deletions fortune-mod/util/find_duplicate_fortunes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#! /usr/bin/python
'''
python3 util/find_duplicate_fortunes.py \
$(git ls ./datfiles/ | grep -vE 'CMa|/data/' |
perl -E '@l=(<>);sub aa{return shift()=~m#^datfiles/off#ms;};
@o=sort{(aa($a)<=>aa($b)) or ($a cmp $b)}@l;say@o;'
)
'''
import sys

def _index_fortunes(filenames):
    """Map every fortune text to the list of places where it occurs.

    A fortune is the run of lines up to (not including) a line that is
    exactly "%\n".  Each location is a ``(filename, startlineno, lineno)``
    tuple of 1-based, inclusive line numbers; ``lineno`` is the line of the
    terminating "%" (or the last line of the file for an unterminated
    trailing fortune), so the span covers the fortune and its terminator.
    """
    locations_by_text = {}

    for filename in filenames:
        with open(filename) as fh:
            chunk = []
            startlineno = 1
            lineno = 0

            for lineno, line in enumerate(fh, 1):
                if line == "%\n":
                    locations_by_text.setdefault("".join(chunk), []).append(
                        (filename, startlineno, lineno)
                    )
                    chunk = []
                    startlineno = lineno + 1
                else:
                    chunk.append(line)

            # A trailing fortune with no closing "%" line still counts.
            if chunk:
                locations_by_text.setdefault("".join(chunk), []).append(
                    (filename, startlineno, lineno)
                )

    return locations_by_text


def _plan_removals(locations_by_text):
    """Return ``{filename: [(startlineno, lineno), ...]}`` of spans to delete.

    The first occurrence of each text is kept; every later occurrence is
    scheduled for removal.  A short report line is printed per duplicated
    text.
    """
    byfn = {}
    for text, locations in locations_by_text.items():
        if len(locations) > 1:
            print(f"Multiple occurrences of '{repr(text)[:60]}':")
            for filename, startlineno, lineno in locations[1:]:
                byfn.setdefault(filename, []).append((startlineno, lineno))
    return byfn


def _remove_spans(filename, matches):
    """Rewrite *filename* without the given 1-based inclusive line spans."""
    # Delete from the bottom of the file upwards so that the line numbers of
    # the spans still waiting to be removed are not shifted.
    spans = sorted(matches, reverse=True)
    print(filename, spans)
    with open(filename) as fh:
        lines = fh.readlines()
    for start, end in spans:
        del lines[start - 1:end]
    with open(filename, "wt") as fh:
        fh.writelines(lines)


def main(argv=None):
    """Find duplicate fortunes in the given files and remove them in place.

    *argv* is the list of fortune file paths; it defaults to the command
    line arguments.  Files are processed in the order given, so the copy of
    a fortune in the earliest file is the one that survives.
    """
    if argv is None:
        # sys.argv[0] is this script itself, not a fortune file.
        argv = sys.argv[1:]

    # A path listed twice would make every fortune in it look like a
    # duplicate of itself and wipe the file, so keep only the first mention.
    filenames = list(dict.fromkeys(argv))

    byfn = _plan_removals(_index_fortunes(filenames))
    for filename, matches in byfn.items():
        _remove_spans(filename, matches)


if __name__ == "__main__":
    main()

0 comments on commit 810a89e

Please sign in to comment.