[en] Only use a kludge for Chinese when lang_code == zh

kristian-clausal · kristian-clausal · commit 533a134abe2e · 2025-12-01T09:38:26.000+02:00
CURRENTLY FAILS NEW TEST, I DON'T KNOW WHY. Fixes #1520. A kludge for Chinese etymology and pronunciation sections, and subsequent changes to them in PR #1303, break the data-stuffing of etymology sections with level-4 pronunciation sections that are preceded by similar sections: pronunciations from the previous etymology are put into the following one, twice (before and after).
diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
@@ -3255,7 +3255,8 @@ def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
                 # be level 4 instead of 3 so that they're part of a larger
                 # etymology hierarchy; usually the data here is empty and
                 # acts as an inbetween between POS and Etymology data
-                inside_level_four = True
+                if lang_code in ("zh",):
+                    inside_level_four = True
                 if t.startswith(PRONUNCIATION_TITLE + " "):
                     # Pronunciation 1, etc, are used in Chinese Glyphs,
                     # and each of them may have senses under Definition
diff --git a/tests/test_en_pronunciation.py b/tests/test_en_pronunciation.py
@@ -710,6 +710,42 @@ def test_sound_before_etymology(self):
         self.assertEqual(data[0]["sounds"], data[1]["sounds"])
         self.assertEqual(data[0]["sounds"], data[2]["sounds"])
 
+    def test_sound_inside_etymology(self):
+        self.wxr.wtp.add_page(
+            "Template:IPA",  # mock must echo its 2nd argument ({{{2}}})
+            10,
+            """[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>([[Appendix:English pronunciation|key]])</sup>:&#32;<span class="IPA">{{{2}}}</span>[[Category:English 1-syllable words|TEE]][[Category:English terms with IPA pronunciation|TEE]]""",
+        )
+        data = parse_page(
+            self.wxr,
+            "tee",
+            """==English==

+===Etymology 1===
+Etymology 1
+====Pronunciation====
+* {{IPA|en|/ˈtiː/}}
+====Noun====
+# The name of the Latin-script letter T/t.
+====Verb====
+# To redirect output to multiple destinations.

+===Etymology 2===
+Etymology 2
+====Pronunciation====
+* {{IPA|en|/ˈtaː/}}
+====Noun====
+# A flat area of ground""",
+        )
+        # Level-4 sounds must stay with their own etymology (issue #1520).
+        self.assertEqual(data[0]["etymology_text"], "Etymology 1")
+        self.assertEqual(data[0]["etymology_text"], data[1]["etymology_text"])
+        self.assertEqual(data[2]["etymology_text"], "Etymology 2")
+        self.assertEqual(data[0]["sounds"], [{"ipa": "/ˈtiː/"}])
+        self.assertEqual(data[2]["sounds"], [{"ipa": "/ˈtaː/"}])
+        self.assertEqual(data[0]["sounds"], data[1]["sounds"])
+        self.assertNotEqual(data[0]["sounds"], data[2]["sounds"])
+
     def test_zh_pron_nested_parentheses(self):
         self.wxr.wtp.add_page(
             "Template:zh-pron",