Skip to content

Commit 533a134

Browse files
[en] Only use a kludge for Chinese when lang_code == zh
CURRENTLY FAILS NEW TEST, I DON'T KNOW WHY. Fixes #1520. A kludge for Chinese etymology and pronunciation sections, and subsequent changes to them in PR #1303, break the data-stuffing of etymology sections with level-4 pronunciation sections that are preceded by similar sections. Pronunciations from the previous etymology are put into the following one, twice (before and after).
1 parent 452b35a commit 533a134

File tree

2 files changed

+38
-1
lines changed

2 files changed

+38
-1
lines changed

src/wiktextract/extractor/en/page.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3255,7 +3255,8 @@ def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
32553255
# be level 4 instead of 3 so that they're part of a larger
32563256
# etymology hierarchy; usually the data here is empty and
32573257
# acts as an intermediary between POS and Etymology data
3258-
inside_level_four = True
3258+
if lang_code in ("zh",):
3259+
inside_level_four = True
32593260
if t.startswith(PRONUNCIATION_TITLE + " "):
32603261
# Pronunciation 1, etc, are used in Chinese Glyphs,
32613262
# and each of them may have senses under Definition

tests/test_en_pronunciation.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,42 @@ def test_sound_before_etymology(self):
710710
self.assertEqual(data[0]["sounds"], data[1]["sounds"])
711711
self.assertEqual(data[0]["sounds"], data[2]["sounds"])
712712

713+
def test_sound_inside_etymology(self):
714+
self.wxr.wtp.add_page(
715+
"Template:IPA",
716+
10,
717+
"""[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>([[Appendix:English pronunciation|key]])</sup>:&#32;<span class="IPA">/ˈtiː/</span>[[Category:English 1-syllable words|TEE]][[Category:English terms with IPA pronunciation|TEE]]""",
718+
)
719+
data = parse_page(
720+
self.wxr,
721+
"tee",
722+
"""==English==
723+
724+
===Etymology 1===
725+
Etymology 1
726+
====Pronunciation====
727+
* {{IPA|en|/ˈtiː/}}
728+
====Noun====
729+
# The name of the Latin-script letter T/t.
730+
====Verb====
731+
# To redirect output to multiple destinations.
732+
733+
===Etymology 2===
734+
Etymology 2
735+
====Pronunciation====
736+
* {{IPA|en|/ˈtaː/}}
737+
====Noun====
738+
# A flat area of ground""",
739+
)
740+
print(data)
741+
self.assertEqual(data[0]["etymology_text"], "Etymology 1")
742+
self.assertEqual(data[0]["etymology_text"], data[1]["etymology_text"])
743+
self.assertEqual(data[2]["etymology_text"], "Etymology 2")
744+
self.assertEqual(data[0]["sounds"], [{"ipa": "/ˈtiː/"}])
745+
self.assertEqual(data[2]["sounds"], [{"ipa": "/ˈtaː/"}])
746+
self.assertEqual(data[0]["sounds"], data[1]["sounds"])
747+
self.assertNotEqual(data[0]["sounds"], data[2]["sounds"])
748+
713749
def test_zh_pron_nested_parentheses(self):
714750
self.wxr.wtp.add_page(
715751
"Template:zh-pron",

0 commit comments

Comments
 (0)