@@ -35,24 +35,25 @@ def extract_sound_section(
3535def extract_sound_template (
3636 wxr : WiktextractContext , base_data : WordEntry , t_node : TemplateNode
3737):
38- if t_node .template_name .lower () in ["ipa" , "hi-ipa" ]:
38+ if t_node .template_name in ["ja-pron" , "ja-IPA" ]:
39+ extract_ja_pron_template (wxr , base_data , t_node )
40+ elif t_node .template_name == "th-pron" :
41+ extract_th_pron_template (wxr , base_data , t_node )
42+ elif t_node .template_name == "lo-pron" :
43+ extract_lo_pron_template (wxr , base_data , t_node )
44+ elif t_node .template_name == "zh-pron" :
45+ extract_zh_pron_template (wxr , base_data , t_node )
46+ elif (
47+ t_node .template_name .lower () == "ipa"
48+ or t_node .template_name .lower ().endswith (("-ipa" , "-pron" ))
49+ ):
3950 extract_ipa_template (wxr , base_data , t_node )
40- elif t_node .template_name .lower () in ["vi-ipa" , "vi-pron" , "sa-ipa" ]:
41- extract_vi_ipa_template (wxr , base_data , t_node )
4251 elif t_node .template_name == "X-SAMPA" :
4352 extract_x_sampa_template (wxr , base_data , t_node )
4453 elif t_node .template_name == "enPR" :
4554 extract_enpr_template (wxr , base_data , t_node )
4655 elif t_node .template_name in ["audio" , "Audio" , "เสียง" ]:
4756 extract_audio_template (wxr , base_data , t_node )
48- elif t_node .template_name == "th-pron" :
49- extract_th_pron_template (wxr , base_data , t_node )
50- elif t_node .template_name == "lo-pron" :
51- extract_lo_pron_template (wxr , base_data , t_node )
52- elif t_node .template_name in ["ja-pron" , "ja-IPA" ]:
53- extract_ja_pron_template (wxr , base_data , t_node )
54- elif t_node .template_name == "zh-pron" :
55- extract_zh_pron_template (wxr , base_data , t_node )
5657 elif t_node .template_name in ["rhymes" , "rhyme" ]:
5758 extract_rhymes_template (wxr , base_data , t_node )
5859 elif t_node .template_name in ["homophones" , "homophone" , "hmp" ]:
@@ -65,48 +66,52 @@ def extract_ipa_template(
6566 expanded_node = wxr .wtp .parse (
6667 wxr .wtp .node_to_wikitext (t_node ), expand_all = True
6768 )
68- extract_ipa_list_item (wxr , base_data , expanded_node )
69+ no_list_nodes = []
70+ for node in expanded_node .children :
71+ if isinstance (node , WikiNode ) and node .kind == NodeKind .LIST :
72+ for list_item in node .find_child (NodeKind .LIST_ITEM ):
73+ extract_ipa_list_item (wxr , base_data , list_item )
74+ else :
75+ no_list_nodes .append (node )
76+ if len (no_list_nodes ) > 0 :
77+ tmp_node = WikiNode (NodeKind .ROOT , 0 )
78+ tmp_node .children = no_list_nodes
79+ extract_ipa_list_item (wxr , base_data , tmp_node )
6980 clean_node (wxr , base_data , expanded_node )
7081
7182
def extract_ipa_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Collect pronunciation entries from one expanded list item.

    The cleaned text of direct italic children and of qualifier spans
    (class ``qualifier-content`` or ``ib-content``, split on commas) is
    gathered as raw tags. Every ``IPA`` or ``Latn`` span whose cleaned text
    is not empty produces a ``Sound`` built with the tags gathered up to
    that point; it is passed through ``translate_raw_tags`` and appended to
    ``base_data.sounds``.
    """
    raw_tags = []
    # Template:vi-ipa location data
    for italic_node in list_item.find_child(NodeKind.ITALIC):
        italic_text = clean_node(wxr, None, italic_node)
        if italic_text != "":
            raw_tags.append(italic_text)

    for span_tag in list_item.find_html_recursively("span"):
        classes = span_tag.attrs.get("class", "").split()

        if "qualifier-content" in classes or "ib-content" in classes:
            # One qualifier span may hold several comma-separated tags.
            pieces = clean_node(wxr, None, span_tag).split(",")
            raw_tags.extend(
                piece.strip() for piece in pieces if piece.strip() != ""
            )
            continue

        # "IPA" takes precedence over "Latn" when a span carries both.
        if "IPA" in classes:
            field = "ipa"
        elif "Latn" in classes:
            field = "roman"
        else:
            continue

        sound = Sound(
            **{field: clean_node(wxr, None, span_tag)}, raw_tags=raw_tags
        )
        # Spans with no text are dropped without translating their tags.
        if getattr(sound, field) != "":
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
97113
98114
99- def extract_vi_ipa_template (
100- wxr : WiktextractContext , base_data : WordEntry , t_node : TemplateNode
101- ):
102- expanded_node = wxr .wtp .parse (
103- wxr .wtp .node_to_wikitext (t_node ), expand_all = True
104- )
105- for list_item in expanded_node .find_child_recursively (NodeKind .LIST_ITEM ):
106- extract_ipa_list_item (wxr , base_data , list_item )
107- clean_node (wxr , base_data , expanded_node )
108-
109-
110115def extract_ja_pron_template (
111116 wxr : WiktextractContext , base_data : WordEntry , t_node : TemplateNode
112117):
0 commit comments