read_dict_data.py
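'''Read per-word dictionary JSON files, render them to Markdown, and update
data/words.json with the resulting paths and flags.

The field names handled below (meta.id, fl, hwi/prs, def/sseq, shortdef,
syns, uros) follow the dictionary-API-style JSON stored in this repo; the
section comments describing them are interpretive, not authoritative.
'''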
import json
import os
import logging
from scripts.utils import exit_program
from scripts.utils import read_and_process_json
from scripts.utils import format_markdown
from scripts.utils import is_phrase
from scripts.utils_md import get_dir
# Configure logging (create the log directory first so the log file can be opened)
os.makedirs('assets/logs', exist_ok=True)
logging.basicConfig(filename='assets/logs/reading_dict_jsons.log',
                    level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')
def interpret_word_data(text_data, data, the_type, snake_case):
    '''Write a Markdown file for one dictionary entry.

    text_data is the word or phrase itself, data the parsed dictionary JSON,
    the_type labels the entry ('word' or 'phrase'), and snake_case is the
    output file stem. Returns (has_md, md_path); md_path is '' when no
    Markdown file was generated.
    '''
    try:
        # Determine the output directory for words or phrases
        output_dir = 'md/phrases' if is_phrase(text_data) else 'md/words'
        # Create the Markdown file name based on the word or phrase
        file_name_output = f'{output_dir}/{snake_case}.md'
        has_md = False
        md_path = ''  # Default empty path for markdown
# Check if the data is a list of strings (no markdown generation)
if isinstance(data, list) and all(isinstance(item, str) for item in data):
has_md = False
md_path = ''
        else:
            # Generate markdown for structured data; make sure the output
            # directory exists before opening the file
            os.makedirs(output_dir, exist_ok=True)
            with open(file_name_output, 'w', encoding='utf-8') as f:
has_md = True
md_path = file_name_output
for entry in data:
# Ensure that the entry is a dictionary
if isinstance(entry, dict):
word = entry.get('meta', {}).get('id', 'N/A')
clean_word = format_markdown(word)
f.write(f'# {clean_word}\n\n')
part_of_speech = entry.get('fl', 'N/A')
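                        # 'hwi' (headword info) carries the 'prs' pronunciation list;
                        # 'mw' is the phonetic spelling and 'sound'/'audio' names the
                        # audio file (assumed dictionary-API field meanings)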
pronunciations = entry.get('hwi', {}).get('prs', [])
pronunciation = pronunciations[0].get('mw', 'N/A') if pronunciations else 'N/A'
audio_ref = pronunciations[0].get('sound', {}).get('audio', 'N/A') if pronunciations else 'N/A'
clean_speech = format_markdown(part_of_speech)
clean_pron = format_markdown(pronunciation)
clean_ref = format_markdown(audio_ref)
f.write(f'**Part of Speech:** {clean_speech}\n\n')
f.write(f'**Pronunciation:** {clean_pron}\n\n')
f.write(f'**Audio Reference:** {clean_ref}\n\n')
# Process definitions
if 'def' in entry:
f.write('## Definitions:\n')
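                            # 'def' holds definition sections; each section's 'sseq'
                            # (sense sequence) nests lists of ['sense', {...}] pairs,
                            # with the defining text and examples under 'dt'
                            # (assumed dictionary-API layout)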
for def_group in entry['def']:
for sense_group in def_group['sseq']:
for sense in sense_group:
if 'sense' in sense[0]:
definition_text = sense[1]['dt'][0][1] if 'dt' in sense[1] else 'N/A'
example_text = sense[1]['dt'][1][1][0]['t'] if len(sense[1].get('dt', [])) > 1 else None
clean_definition = format_markdown(definition_text)
f.write(f'- {clean_definition}\n')
if example_text:
clean_example = format_markdown(example_text)
f.write(f' *Example:* {clean_example}\n')
# Short definitions
if 'shortdef' in entry:
f.write('\n## Short Definitions:\n')
for short_def in entry['shortdef']:
clean_shortdef = format_markdown(short_def)
f.write(f'- {clean_shortdef}\n')
# Synonyms
if 'syns' in entry:
f.write('\n## Synonyms:\n')
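                            # 'syns' holds synonym paragraphs; 'pt' is the paragraph
                            # text, and only items exposing a 'text' part are written
                            # (assumed dictionary-API layout)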
for synonym_group in entry['syns']:
for synonym in synonym_group.get('pt', []):
if 'text' in synonym[0]:
                                        clean_text = format_markdown(synonym[0]['text'])
f.write(f'- {clean_text}\n')
# Related forms
if 'uros' in entry:
f.write('\n## Related Forms:\n')
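                            # 'uros' lists run-on forms of the headword, each with its
                            # own spelling ('ure') and pronunciation (assumed layout)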
for form in entry['uros']:
related_word = form.get('ure', 'N/A')
related_pronunciation = form.get('prs', [{}])[0].get('mw', 'N/A')
clean_text = format_markdown(f'{related_word} ({related_pronunciation})')
f.write(f'- {clean_text}\n')
else:
logging.error(f'Unexpected entry format: {entry}')
return has_md, md_path # Return markdown status and path
except Exception as e:
logging.error(f'Error generating markdown for {text_data}: {e}')
return False, ''
def process_all_json_files_in_directory(words_file, mistakes_file):
    '''Enrich every entry in words_file with dictionary-derived fields.

    For each entry, load its dictionary JSON from data/dict/ (via get_dir),
    generate Markdown under md/words/ or md/phrases/ with
    interpret_word_data(), flag it if it also appears in mistakes_file, and
    write the updated entries (dict_path, has_md, md_path, mistake) back to
    words_file.
    '''
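    # Each words.json item is expected to look roughly like this (shape
    # inferred from the keys used below; the example word is hypothetical):
    #   {"text": "serendipity", "dict_json": "serendipity.json", "type": "word"}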
words_data = read_and_process_json(words_file)
    # Fall back to an empty list so the mistake lookup below still works
    mistakes_data = read_and_process_json(mistakes_file) or []
if not words_data:
logging.error(f'Error: Could not read JSON file: {words_file}')
exit_program()
for item in words_data:
word_or_phrase = item['text']
dict_filename = item['dict_json']
the_type = item['type']
dict_file_path = get_dir(filename=dict_filename,
type=the_type,
for_json_dict=True)
if os.path.exists(dict_file_path):
try:
with open(dict_file_path, 'r', encoding='utf-8') as file:
json_data = json.load(file)
# Interpret the word data and generate markdown if needed
has_md, md_path = interpret_word_data(
word_or_phrase,
json_data,
the_type,
snake_case=dict_filename.replace('.json', '')
)
# Update the word entry with dict_path, mistake, has_md, and md_path
item['dict_path'] = dict_file_path
item['has_md'] = has_md
item['md_path'] = md_path if has_md else ''
# Check if the word or phrase exists in mistakes.json
item['mistake'] = any(m['text'] == word_or_phrase for m in mistakes_data)
except Exception as e:
logging.error(f'Error processing {dict_filename}: {e}')
else:
logging.warning(f'File not found: {dict_file_path}')
# Write the updated words_data back to the words.json file
with open(words_file, 'w', encoding='utf-8') as json_outfile:
json.dump(words_data, json_outfile, indent=4)
print(f'All files processed and {words_file} updated successfully.')
# Expected directory layout:
#   words_md_dir = 'md/words/'      # Output directory for word markdown files
#   phrases_md_dir = 'md/phrases/'  # Output directory for phrase markdown files
#   json_directory = 'data/dict/'   # Directory containing dictionary .json files
# To create them up front:
#   os.makedirs(words_md_dir, exist_ok=True)
#   os.makedirs(phrases_md_dir, exist_ok=True)
if __name__ == '__main__':
    # Define directories and file paths
    words_json_file = 'data/words.json'        # Path to words.json file
    mistakes_json_file = 'data/mistakes.json'  # Path to mistakes.json file
    # Process the dictionary files and update words.json
    process_all_json_files_in_directory(words_json_file, mistakes_json_file)