Skip to content

Commit 2be7a75

Browse files
authored
[de] extract article data to Form.article field (#1545)
only do this for the "Deutsch Substantiv Übersicht" table template
1 parent 606a11c commit 2be7a75

File tree

3 files changed

+45
-9
lines changed

3 files changed

+45
-9
lines changed

src/wiktextract/extractor/de/inflection.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,11 @@ class RowspanHeader:
5454

5555

5656
def process_verb_table(
57-
wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
57+
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
5858
) -> None:
5959
# Vorlage:Deutsch Verb Übersicht
6060
expanded_template = wxr.wtp.parse(
61-
wxr.wtp.node_to_wikitext(template_node), expand_all=True
61+
wxr.wtp.node_to_wikitext(t_node), expand_all=True
6262
)
6363
table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
6464
if len(table_nodes) == 0:
@@ -137,20 +137,22 @@ def process_verb_table(
137137

138138

139139
def process_noun_table(
140-
wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
140+
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
141141
) -> None:
142142
# Vorlage:Deutsch Substantiv Übersicht
143143
from .page import extract_note_section
144144

145145
expanded_template = wxr.wtp.parse(
146-
wxr.wtp.node_to_wikitext(template_node), expand_all=True
146+
wxr.wtp.node_to_wikitext(t_node), expand_all=True
147147
)
148148
table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
149149
if len(table_nodes) == 0:
150150
return
151151
table_node = table_nodes[0]
152152
column_headers = []
153153
table_header = ""
154+
forms = []
155+
flexion_pages = []
154156
for table_row in table_node.find_child(NodeKind.TABLE_ROW):
155157
row_header = ""
156158
is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
@@ -188,7 +190,7 @@ def process_noun_table(
188190
for link_node in table_cell.find_child(NodeKind.LINK):
189191
link_text = clean_node(wxr, None, link_node)
190192
if link_text.startswith("Flexion:"):
191-
parse_flexion_page(wxr, word_entry, link_text)
193+
flexion_pages.append(link_text)
192194
else:
193195
for form_text in cell_text.splitlines():
194196
form_text = form_text.strip()
@@ -209,10 +211,15 @@ def process_noun_table(
209211
):
210212
form.raw_tags.append(col_header.text)
211213
translate_raw_tags(form)
212-
word_entry.forms.append(form)
214+
forms.append(form)
213215
col_index += 1
214216

217+
if t_node.template_name == "Deutsch Substantiv Übersicht":
218+
forms = seprarte_de_article(wxr, forms)
219+
word_entry.forms.extend(forms)
215220
clean_node(wxr, word_entry, expanded_template) # category links
221+
for flexion_page in flexion_pages:
222+
parse_flexion_page(wxr, word_entry, flexion_page)
216223
# Vorlage:Deutsch Nachname Übersicht
217224
for level_node in expanded_template.find_child(NodeKind.LEVEL4):
218225
section_text = clean_node(wxr, None, level_node.largs)
@@ -326,3 +333,19 @@ def extract_pronoun_table(
326333
word_entry.forms.append(form)
327334
article = ""
328335
col_index += 1
336+
337+
338+
def seprarte_de_article(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Split a leading German definite article off each form.

    For every entry in ``forms`` whose ``form`` text starts with one of
    ``der``, ``die``, ``das``, ``den``, ``dem`` or ``des`` followed by
    whitespace, the article is moved into ``article`` and removed from
    ``form``. The entries are modified in place.

    Returns a new list holding the entries, in their original order, whose
    (possibly shortened) ``form`` differs from the page title
    ``wxr.wtp.title``; entries equal to the title are left out.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deutsch_Substantiv_Übersicht
    # https://en.wikipedia.org/wiki/German_articles
    article_prefix = r"(der|die|das|den|dem|des)\s+"
    kept = []
    for entry in forms:
        found = re.match(article_prefix, entry.form)
        if found is not None:
            # Record the article, then keep only the text that follows it.
            entry.article = found.group(1)
            entry.form = entry.form[found.end() :]
        if entry.form == wxr.wtp.title:
            # Identical to the page title once the article is removed.
            continue
        kept.append(entry)
    return kept

src/wiktextract/extractor/de/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ class Form(GermanBaseModel):
101101
sense_index: str = ""
102102
topics: list[str] = []
103103
pronouns: list[str] = []
104+
article: str = ""
104105

105106

106107
class Descendant(GermanBaseModel):

tests/test_de_forms.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,21 @@ def test_noun_table(self):
4747
self.assertEqual(
4848
word_entry.model_dump(exclude_defaults=True)["forms"],
4949
[
50-
{"form": "des Wörterbuches", "tags": ["genitive", "singular"]},
51-
{"form": "des Wörterbuchs", "tags": ["genitive", "singular"]},
52-
{"form": "der Wörterbücher", "tags": ["genitive", "plural"]},
50+
{
51+
"article": "des",
52+
"form": "Wörterbuches",
53+
"tags": ["genitive", "singular"],
54+
},
55+
{
56+
"article": "des",
57+
"form": "Wörterbuchs",
58+
"tags": ["genitive", "singular"],
59+
},
60+
{
61+
"article": "der",
62+
"form": "Wörterbücher",
63+
"tags": ["genitive", "plural"],
64+
},
5365
],
5466
)
5567

0 commit comments

Comments
 (0)