Skip to content

Commit 2e08598

Browse files
[en] Only use a kludge for Chinese when lang_code == zh
Fixes #1520. A kludge for Chinese etymology and pronunciation sections, and subsequent changes to them in PR #1303, break the data-stuffing of etymology sections with level-4 pronunciation sections that are preceded by similar sections. Pronunciations from the previous etymology are put into the following one, twice (before and after).
1 parent 452b35a commit 2e08598

File tree

2 files changed

+43
-1
lines changed

2 files changed

+43
-1
lines changed

src/wiktextract/extractor/en/page.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3255,7 +3255,8 @@ def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
32553255
# be level 4 instead of 3 so that they're part of a larger
32563256
# etymology hierarchy; usually the data here is empty and
32573257
# acts as an inbetween between POS and Etymology data
3258-
inside_level_four = True
3258+
if lang_code in ("zh",):
3259+
inside_level_four = True
32593260
if t.startswith(PRONUNCIATION_TITLE + " "):
32603261
# Pronunciation 1, etc, are used in Chinese Glyphs,
32613262
# and each of them may have senses under Definition

tests/test_en_pronunciation.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,47 @@ def test_sound_before_etymology(self):
710710
self.assertEqual(data[0]["sounds"], data[1]["sounds"])
711711
self.assertEqual(data[0]["sounds"], data[2]["sounds"])
712712

713+
def test_sound_inside_etymology(self):
714+
self.wxr.wtp.add_page(
715+
"Template:IPA",
716+
10,
717+
"""[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>([[Appendix:English pronunciation|key]])</sup>:&#32;<span class="IPA">/ˈtiː/</span>[[Category:English 1-syllable words|TEE]][[Category:English terms with IPA pronunciation|TEE]]""",
718+
)
719+
self.wxr.wtp.add_page(
720+
"Template:IPA2",
721+
10,
722+
"""[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>([[Appendix:English pronunciation|key]])</sup>:&#32;<span class="IPA">/ˈtaː/</span>[[Category:English 1-syllable words|TEE]][[Category:English terms with IPA pronunciation|TEE]]""",
723+
)
724+
data = parse_page(
725+
self.wxr,
726+
"tee",
727+
"""==English==
728+
729+
===Etymology 1===
730+
Etymology 1
731+
====Pronunciation====
732+
* {{IPA|en|/ˈtiː/}}
733+
====Noun====
734+
# The name of the Latin-script letter T/t.
735+
====Verb====
736+
# To redirect output to multiple destinations.
737+
738+
===Etymology 2===
739+
Etymology 2
740+
====Pronunciation====
741+
* {{IPA2|en|/ˈtaː/}}
742+
====Noun====
743+
# A flat area of ground""",
744+
)
745+
print(data)
746+
self.assertEqual(data[0]["etymology_text"], "Etymology 1")
747+
self.assertEqual(data[0]["etymology_text"], data[1]["etymology_text"])
748+
self.assertEqual(data[2]["etymology_text"], "Etymology 2")
749+
self.assertEqual(data[0]["sounds"], [{"ipa": "/ˈtiː/"}])
750+
self.assertEqual(data[2]["sounds"], [{"ipa": "/ˈtaː/"}])
751+
self.assertEqual(data[0]["sounds"], data[1]["sounds"])
752+
self.assertNotEqual(data[0]["sounds"], data[2]["sounds"])
753+
713754
def test_zh_pron_nested_parentheses(self):
714755
self.wxr.wtp.add_page(
715756
"Template:zh-pron",

0 commit comments

Comments
 (0)