-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathParseRAEX.py
executable file
·131 lines (108 loc) · 3.68 KB
/
ParseRAEX.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/python3
# Copyright 2024 Yury Gribov
#
# Use of this source code is governed by MIT license that can be
# found in the LICENSE.txt file.
#
# A helper tool to convert a RAEX school rating page to the text form
# expected by SchoolTracker.py.
import sys
import os
import os.path
import re
import subprocess
import argparse
def warn(msg):
    """Write a warning line, prefixed with the module-level `me`, to stderr."""
    line = '%s: warning: %s\n' % (me, msg)
    sys.stderr.write(line)
def error(msg):
    """Write an error line, prefixed with the module-level `me`, to stderr
    and terminate the process with exit status 1."""
    line = '%s: error: %s\n' % (me, msg)
    sys.stderr.write(line)
    # Equivalent to sys.exit(1), which raises SystemExit under the hood.
    raise SystemExit(1)
def ensure_module(module, package=None, user=True, quiet=False):
    """
    Installs module if it's missing. Call like
      ensure_module('configparser')
      ensure_module('wx', 'wxPython')

    Arguments:
      module  - name of the module to import (dotted names are allowed).
      package - name of the pip package that provides the module;
                defaults to the module name.
      user    - if true, install with 'pip install --user' and add the
                user site-packages directory to sys.path; otherwise
                install globally and add the global site-packages
                directories.
      quiet   - if true, do not print the "Installing..." message.

    Returns None. Calls error() (which terminates the process) if pip is
    not available or if the module still cannot be imported after the
    installation. A failing pip run raises subprocess.CalledProcessError.
    """
    import importlib
    import site

    # Fast path: nothing to do if the module is already importable.
    try:
        importlib.import_module(module)
        return
    except ImportError:
        pass

    if not quiet:
        print("Installing Python module %s..." % module)
    package = package or module

    try:
        import pip  # noqa: F401 - only checking that pip is available
    except ImportError:
        error("install python3-pip")

    install_cmd = [sys.executable, '-mpip', 'install']
    if user:
        install_cmd.append('--user')
    install_cmd.append(package)
    subprocess.check_call(install_cmd)

    # User site packages are often not in PATH by default.
    # NOTE: getusersitepackages() returns a single string whereas
    # getsitepackages() returns a list, so the former must be wrapped
    # (iterating the bare string would add single characters to sys.path).
    if user:
        site_dirs = [site.getusersitepackages()]
    else:
        site_dirs = site.getsitepackages()
    for d in site_dirs:
        if d not in sys.path:
            sys.path.append(d)

    # Directory listings may be cached by the import system; drop the
    # caches so that the freshly installed module can be found.
    importlib.invalidate_caches()

    try:
        importlib.import_module(module)
    except ImportError:
        error("module '%s' not found in package '%s'\n" % (module, package))
# Name of this script, used as the prefix of warn()/error() messages.
# It has to be assigned before the first ensure_module() call because
# ensure_module() may call error(), which reads this variable.
me = os.path.basename(__file__)

# Verbosity level; main() overwrites it with the --verbose count.
v = 0

# Third-party dependencies are installed on demand before being imported.
ensure_module('requests', user=True)
import requests

ensure_module('bs4', user=True)
from bs4 import BeautifulSoup
def main():
    """
    Parse a RAEX rating page and print it in tab-separated text form.

    The single positional argument WEBLINK is either a path to an existing
    local HTML file or a URL that is downloaded with requests. For every
    row of the first table in the page one line is printed to stdout:
    name, region, city and rating separated by tabs.

    Returns 0 on success. Calls error() (which terminates the process with
    status 1) if the page cannot be downloaded, contains no table, or the
    table header lacks one of the required columns.
    """
    global v

    parser = argparse.ArgumentParser(description="A helper tool to convert RAEX rating to text form.",
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog="""\
Examples:
# Рейтинг лучших школ России по конкурентоспособности выпускников
$ python3 {0} https://raex-rr.com/education/best_schools/top-100_russian_schools/2023/
# Рейтинг школ по количеству выпускников, поступивших в ведущие вузы России
$ python3 {0} https://raex-rr.com/education/schools_rating/top-300_schools/2023/
""".format(me))
    parser.add_argument('--verbose', '-v',
                        help="Print diagnostic info (can be specified more than once).",
                        action='count', default=0)
    parser.add_argument('weblink',
                        help="Path to rating page or HTML file.", metavar='WEBLINK')
    args = parser.parse_args()
    v = args.verbose

    if os.path.exists(args.weblink):
        # Read raw bytes and let BeautifulSoup detect the encoding from the
        # document itself; text mode would use the locale's default
        # encoding, which is not necessarily the one the page was saved in.
        with open(args.weblink, 'rb') as f:
            html = f.read()
    else:
        try:
            # Without a timeout requests may wait for the server forever.
            response = requests.get(args.weblink, timeout=60)
            # Fail on HTTP errors instead of trying to parse an error page.
            response.raise_for_status()
        except requests.RequestException as e:
            error("failed to download '%s': %s" % (args.weblink, e))
        html = response.text

    s = BeautifulSoup(html, 'html.parser')

    table = s.table
    if (table is None or table.thead is None or table.thead.tr is None
            or table.tbody is None):
        error("no rating table found in '%s'" % args.weblink)

    # First parse header

    # Maps header captions (different ratings use different captions)
    # to the names of the fields we print.
    captions = {
        'Название': 'Name',
        'Школа': 'Name',
        'Субъект федерации': 'Region',
        'Регион': 'Region',
        'Город': 'City',
        'Балл': 'Rating',
    }

    toc = {}
    for i, th in enumerate(table.thead.tr.find_all('th')):
        # Caption is normally inside a <span>; fall back to the text of the
        # whole cell if the span is absent.
        span = th.span
        name = (span.text if span is not None else th.get_text()).strip()
        field = captions.get(name)
        if field is not None:
            toc[field] = i

    if v:
        print('TOC: %s' % toc)

    fields = ('Name', 'Region', 'City', 'Rating')
    missing = [field for field in fields if field not in toc]
    if missing:
        error("columns not found in table header: %s" % ', '.join(missing))

    # Largest cell index that will be read from each row
    last = max(toc[field] for field in fields)

    # Then process rows

    for tr in table.tbody.find_all('tr'):
        # Collect fields (header cells of the row first, then data cells)
        row = [th['data-content'] for th in tr.find_all('th')]
        row += [td['data-content'] for td in tr.find_all('td')]

        if v:
            print(row)

        if len(row) <= last:
            warn("skipping row with %d cells (at least %d expected)"
                 % (len(row), last + 1))
            continue

        # Print in text format expected by SchoolTracker.py
        name = row[toc['Name']]
        region = row[toc['Region']]
        city = row[toc['City']]
        rating = row[toc['Rating']]
        print("%s\t%s\t%s\t%s" % (name, region, city, rating))

    return 0
# Run main() only when executed as a script and use its return value
# as the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)