Skip to content

Commit ee75a38

Browse files
authored
[ja,th,vi,zh] combine "ipa" template functions (#1518)
- handle "ca-ipa" template which has partial list nodes
- extract more "*-ipa" templates
1 parent 136bb68 commit ee75a38

File tree

8 files changed

+175
-127
lines changed

8 files changed

+175
-127
lines changed

src/wiktextract/extractor/ja/header.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,9 @@ def extract_header_nodes(
5252
or "form-of" in node.attrs.get("class", "")
5353
):
5454
continue
55-
if isinstance(node, HTMLNode) and node.tag in ["small", "i"]:
55+
if (isinstance(node, HTMLNode) and node.tag in ["small", "i"]) or (
56+
isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC
57+
):
5658
raw_tag = clean_node(wxr, None, node).strip("(): ")
5759
if raw_tag != "又は" and raw_tag not in raw_tags:
5860
# ignore "又は"(or) in "ja-noun" template

src/wiktextract/extractor/ja/sound.py

Lines changed: 23 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -58,10 +58,11 @@ def process_sound_template(
5858
) -> None:
5959
if t_node.template_name in ["音声", "audio"]:
6060
extract_audio_template(wxr, t_node, sounds)
61-
elif t_node.template_name in ["IPA", "X-SAMPA", "hi-IPA"]:
61+
elif t_node.template_name in [
62+
"IPA",
63+
"X-SAMPA",
64+
] or t_node.template_name.endswith("-IPA"):
6265
extract_ipa_template(wxr, t_node, sounds)
63-
elif t_node.template_name == "sa-IPA":
64-
extract_ipa_list_template(wxr, t_node, sounds)
6566
elif t_node.template_name == "homophones":
6667
extract_homophones_template(wxr, t_node, sounds)
6768
elif t_node.template_name == "ja-pron":
@@ -102,32 +103,35 @@ def extract_ipa_template(
102103
expanded_node = wxr.wtp.parse(
103104
wxr.wtp.node_to_wikitext(t_node), expand_all=True
104105
)
105-
sounds.extend(extract_ipa_list_item(wxr, expanded_node))
106-
107-
108-
def extract_ipa_list_template(
109-
wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
110-
):
111-
expanded_node = wxr.wtp.parse(
112-
wxr.wtp.node_to_wikitext(t_node), expand_all=True
113-
)
114-
for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
115-
sounds.extend(extract_ipa_list_item(wxr, list_item))
106+
no_list_nodes = []
107+
for node in expanded_node.children:
108+
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
109+
for list_item in node.find_child(NodeKind.LIST_ITEM):
110+
sounds.extend(extract_ipa_list_item(wxr, list_item))
111+
else:
112+
no_list_nodes.append(node)
113+
if len(no_list_nodes) > 0:
114+
tmp_node = WikiNode(NodeKind.ROOT, 0)
115+
tmp_node.children = no_list_nodes
116+
sounds.extend(extract_ipa_list_item(wxr, tmp_node))
116117

117118

118119
def extract_ipa_list_item(
119120
wxr: WiktextractContext, list_item: WikiNode
120121
) -> list[Sound]:
121-
raw_tag = ""
122+
raw_tags = []
122123
sounds = []
123124
for span_tag in list_item.find_html_recursively("span"):
124125
span_class = span_tag.attrs.get("class", "").split()
125126
if "qualifier-content" in span_class or "ib-content" in span_class:
126-
raw_tag = clean_node(wxr, None, span_tag)
127+
for raw_tag in clean_node(wxr, None, span_tag).split(","):
128+
raw_tag = raw_tag.strip()
129+
if raw_tag != "":
130+
raw_tags.append(raw_tag)
127131
elif "IPA" in span_class or "SAMPA" in span_class:
128-
sound = Sound(ipa=clean_node(wxr, None, span_tag))
129-
if raw_tag != "":
130-
sound.raw_tags.append(raw_tag)
132+
sound = Sound(
133+
ipa=clean_node(wxr, None, span_tag), raw_tags=raw_tags
134+
)
131135
if sound.ipa != "":
132136
if "SAMPA" in span_class:
133137
sound.ipa = f"/{sound.ipa}/"

src/wiktextract/extractor/ja/tags.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -308,6 +308,8 @@
308308
"廃用": "obsolete",
309309
# Template:ja-noun
310310
"旧字体": "kyūjitai",
311+
# Template:ca-verb
312+
"現在第一人称単数形": ["first-person", "singular", "present"],
311313
}
312314

313315
TOPICS = {

src/wiktextract/extractor/th/sound.py

Lines changed: 38 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -35,24 +35,25 @@ def extract_sound_section(
3535
def extract_sound_template(
3636
wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
3737
):
38-
if t_node.template_name.lower() in ["ipa", "hi-ipa"]:
38+
if t_node.template_name in ["ja-pron", "ja-IPA"]:
39+
extract_ja_pron_template(wxr, base_data, t_node)
40+
elif t_node.template_name == "th-pron":
41+
extract_th_pron_template(wxr, base_data, t_node)
42+
elif t_node.template_name == "lo-pron":
43+
extract_lo_pron_template(wxr, base_data, t_node)
44+
elif t_node.template_name == "zh-pron":
45+
extract_zh_pron_template(wxr, base_data, t_node)
46+
elif (
47+
t_node.template_name.lower() == "ipa"
48+
or t_node.template_name.lower().endswith(("-ipa", "-pron"))
49+
):
3950
extract_ipa_template(wxr, base_data, t_node)
40-
elif t_node.template_name.lower() in ["vi-ipa", "vi-pron", "sa-ipa"]:
41-
extract_vi_ipa_template(wxr, base_data, t_node)
4251
elif t_node.template_name == "X-SAMPA":
4352
extract_x_sampa_template(wxr, base_data, t_node)
4453
elif t_node.template_name == "enPR":
4554
extract_enpr_template(wxr, base_data, t_node)
4655
elif t_node.template_name in ["audio", "Audio", "เสียง"]:
4756
extract_audio_template(wxr, base_data, t_node)
48-
elif t_node.template_name == "th-pron":
49-
extract_th_pron_template(wxr, base_data, t_node)
50-
elif t_node.template_name == "lo-pron":
51-
extract_lo_pron_template(wxr, base_data, t_node)
52-
elif t_node.template_name in ["ja-pron", "ja-IPA"]:
53-
extract_ja_pron_template(wxr, base_data, t_node)
54-
elif t_node.template_name == "zh-pron":
55-
extract_zh_pron_template(wxr, base_data, t_node)
5657
elif t_node.template_name in ["rhymes", "rhyme"]:
5758
extract_rhymes_template(wxr, base_data, t_node)
5859
elif t_node.template_name in ["homophones", "homophone", "hmp"]:
@@ -65,48 +66,52 @@ def extract_ipa_template(
6566
expanded_node = wxr.wtp.parse(
6667
wxr.wtp.node_to_wikitext(t_node), expand_all=True
6768
)
68-
extract_ipa_list_item(wxr, base_data, expanded_node)
69+
no_list_nodes = []
70+
for node in expanded_node.children:
71+
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
72+
for list_item in node.find_child(NodeKind.LIST_ITEM):
73+
extract_ipa_list_item(wxr, base_data, list_item)
74+
else:
75+
no_list_nodes.append(node)
76+
if len(no_list_nodes) > 0:
77+
tmp_node = WikiNode(NodeKind.ROOT, 0)
78+
tmp_node.children = no_list_nodes
79+
extract_ipa_list_item(wxr, base_data, tmp_node)
6980
clean_node(wxr, base_data, expanded_node)
7081

7182

7283
def extract_ipa_list_item(
7384
wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
7485
):
75-
raw_tag = ""
86+
raw_tags = []
7687
for italic_node in list_item.find_child(NodeKind.ITALIC):
7788
# Template:vi-ipa location data
7889
raw_tag = clean_node(wxr, None, italic_node)
90+
if raw_tag != "":
91+
raw_tags.append(raw_tag)
7992
for span_tag in list_item.find_html_recursively("span"):
8093
span_class = span_tag.attrs.get("class", "").split()
8194
if "qualifier-content" in span_class or "ib-content" in span_class:
82-
raw_tag = clean_node(wxr, None, span_tag)
95+
for raw_tag in clean_node(wxr, None, span_tag).split(","):
96+
raw_tag = raw_tag.strip()
97+
if raw_tag != "":
98+
raw_tags.append(raw_tag)
8399
elif "IPA" in span_class:
84-
sound = Sound(ipa=clean_node(wxr, None, span_tag))
85-
if raw_tag != "":
86-
sound.raw_tags.append(raw_tag)
87-
translate_raw_tags(sound)
100+
sound = Sound(
101+
ipa=clean_node(wxr, None, span_tag), raw_tags=raw_tags
102+
)
88103
if sound.ipa != "":
104+
translate_raw_tags(sound)
89105
base_data.sounds.append(sound)
90106
elif "Latn" in span_class:
91-
sound = Sound(roman=clean_node(wxr, None, span_tag))
92-
if raw_tag != "":
93-
sound.raw_tags.append(raw_tag)
94-
translate_raw_tags(sound)
107+
sound = Sound(
108+
roman=clean_node(wxr, None, span_tag), raw_tags=raw_tags
109+
)
95110
if sound.roman != "":
111+
translate_raw_tags(sound)
96112
base_data.sounds.append(sound)
97113

98114

99-
def extract_vi_ipa_template(
100-
wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
101-
):
102-
expanded_node = wxr.wtp.parse(
103-
wxr.wtp.node_to_wikitext(t_node), expand_all=True
104-
)
105-
for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
106-
extract_ipa_list_item(wxr, base_data, list_item)
107-
clean_node(wxr, base_data, expanded_node)
108-
109-
110115
def extract_ja_pron_template(
111116
wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
112117
):

src/wiktextract/extractor/vi/sound.py

Lines changed: 36 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -38,8 +38,6 @@ def extract_sound_template(
3838
"audio-for-pron",
3939
]:
4040
extract_pron_audio_template(wxr, base_data, t_node)
41-
elif t_node.template_name in ["tyz-IPA", "hi-IPA", "sa-IPA", "san-IPA"]:
42-
extract_tyz_ipa_template(wxr, base_data, t_node)
4341
elif t_node.template_name in ["zh-pron", "zho-pron"]:
4442
extract_zh_pron_template(wxr, base_data, t_node)
4543
elif t_node.template_name in ["th-pron", "tha-pron"]:
@@ -202,35 +200,6 @@ def extract_audio_template(
202200
base_data.sounds.append(sound)
203201

204202

205-
def extract_tyz_ipa_template(
206-
wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
207-
):
208-
expanded_node = wxr.wtp.parse(
209-
wxr.wtp.node_to_wikitext(t_node), expand_all=True
210-
)
211-
for list in expanded_node.find_child(NodeKind.LIST):
212-
for list_item in list.find_child(NodeKind.LIST_ITEM):
213-
raw_tag = ""
214-
for node in list_item.find_child(NodeKind.ITALIC | NodeKind.LINK):
215-
if node.kind == NodeKind.ITALIC:
216-
raw_tag = clean_node(wxr, None, node)
217-
elif node.kind == NodeKind.LINK:
218-
clean_node(wxr, base_data, node)
219-
for span_tag in list_item.find_html_recursively("span"):
220-
class_names = span_tag.attrs.get("class", "").split()
221-
if "IPA" in class_names:
222-
sound = Sound(ipa=clean_node(wxr, None, span_tag))
223-
if raw_tag != "":
224-
sound.raw_tags.append(raw_tag)
225-
if sound.ipa != "":
226-
translate_raw_tags(sound)
227-
base_data.sounds.append(sound)
228-
elif "label-content" in class_names:
229-
raw_tag = clean_node(wxr, None, span_tag)
230-
for link_node in expanded_node.find_child(NodeKind.LINK):
231-
clean_node(wxr, base_data, link_node)
232-
233-
234203
def extract_ipa_template(
235204
wxr: WiktextractContext,
236205
base_data: WordEntry,
@@ -241,23 +210,46 @@ def extract_ipa_template(
241210
expanded_node = wxr.wtp.parse(
242211
wxr.wtp.node_to_wikitext(t_node), expand_all=True
243212
)
213+
no_list_nodes = []
214+
for node in expanded_node.children:
215+
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
216+
for list_item in node.find_child(NodeKind.LIST_ITEM):
217+
extract_ipa_list_item(wxr, base_data, list_item, ipa_class)
218+
else:
219+
no_list_nodes.append(node)
220+
if len(no_list_nodes) > 0:
221+
tmp_node = WikiNode(NodeKind.ROOT, 0)
222+
tmp_node.children = no_list_nodes
223+
extract_ipa_list_item(wxr, base_data, tmp_node, ipa_class)
224+
clean_node(wxr, base_data, expanded_node)
225+
226+
227+
def extract_ipa_list_item(
228+
wxr: WiktextractContext,
229+
base_data: WordEntry,
230+
list_item: WikiNode,
231+
class_name: str,
232+
):
244233
raw_tags = []
245-
for span_tag in expanded_node.find_html("span"):
246-
class_names = span_tag.attrs.get("class", "").split()
247-
if "qualifier-content" in class_names or "label-content" in class_names:
248-
raw_tag = clean_node(wxr, None, span_tag)
249-
if raw_tag != "":
250-
raw_tags.append(raw_tag)
251-
elif ipa_class in class_names:
252-
ipa = clean_node(wxr, None, span_tag)
253-
if ipa != "":
254-
sound = Sound(ipa=ipa, raw_tags=raw_tags)
234+
for italic_node in list_item.find_child(NodeKind.ITALIC):
235+
raw_tag = clean_node(wxr, None, italic_node)
236+
if raw_tag != "":
237+
raw_tags.append(raw_tag)
238+
for span_tag in list_item.find_html_recursively("span"):
239+
span_class = span_tag.attrs.get("class", "").split()
240+
if "qualifier-content" in span_class or "label-content" in span_class:
241+
for raw_tag in clean_node(wxr, None, span_tag).split(","):
242+
raw_tag = raw_tag.strip()
243+
if raw_tag != "":
244+
raw_tags.append(raw_tag)
245+
elif class_name in span_class:
246+
sound = Sound(
247+
ipa=clean_node(wxr, None, span_tag), raw_tags=raw_tags
248+
)
249+
if sound.ipa != "":
255250
translate_raw_tags(sound)
256251
base_data.sounds.append(sound)
257252

258-
for link in expanded_node.find_child(NodeKind.LINK):
259-
clean_node(wxr, base_data, link)
260-
261253

262254
def extract_rhymes_template(
263255
wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode

0 commit comments

Comments
 (0)