Skip to content

Commit 64c6f4a

Browse files
authored
Merge pull request #15 from explosion/feature/doc-markdown
Add Doc._.markdown
2 parents 9647ce6 + 0d06079 commit 64c6f4a

File tree

5 files changed

+18
-2
lines changed

5 files changed

+18
-2
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,8 @@ print(doc.text)
4040
print(doc._.layout)
4141
# Tables in the document and their extracted data
4242
print(doc._.tables)
43+
# Markdown representation of the document
44+
print(doc._.markdown)
4345

4446
# Layout spans for different sections
4547
for span in doc.spans["layout"]:
@@ -114,6 +116,7 @@ for span in doc.spans["layout"]:
114116
| `Doc._.layout` | `DocLayout` | Layout features of the document. |
115117
| `Doc._.pages` | `list[tuple[PageLayout, list[Span]]]` | Pages in the document and the spans they contain. |
116118
| `Doc._.tables` | `list[Span]` | All tables in the document. |
119+
| `Doc._.markdown` | `str` | Markdown representation of the document. |
117120
| `Doc.spans["layout"]` | `spacy.tokens.SpanGroup` | The layout spans in the document. |
118121
| `Span.label_` | `str` | The type of the extracted layout span, e.g. `"text"` or `"section_header"`. [See here](https://github.com/DS4SD/docling-core/blob/14cad33ae7f8dc011a79dd364361d2647c635466/docling_core/types/doc/labels.py) for options. |
119122
| `Span.label` | `int` | The integer ID of the span label. |
@@ -161,7 +164,7 @@ layout = spaCyLayout(nlp)
161164
| --- | --- | --- |
162165
| `nlp` | `spacy.language.Language` | The initialized `nlp` object to use for tokenization. |
163166
| `separator` | `str` | Token used to separate sections in the created `Doc` object. The separator won't be part of the layout span. If `None`, no separator will be added. Defaults to `"\n\n"`. |
164-
| `attrs` | `dict[str, str]` | Override the custom spaCy attributes. Can include `"doc_layout"`, `"doc_pages"`, `"doc_tables"`, `"span_layout"`, `"span_data"`, `"span_heading"` and `"span_group"`. |
167+
| `attrs` | `dict[str, str]` | Override the custom spaCy attributes. Can include `"doc_layout"`, `"doc_pages"`, `"doc_tables"`, `"doc_markdown"`, `"span_layout"`, `"span_data"`, `"span_heading"` and `"span_group"`. |
165168
| `headings` | `list[str]` | Labels of headings to consider for `Span._.heading` detection. Defaults to `["section_header", "page_header", "title"]`. |
166169
| `display_table` | `Callable[[pandas.DataFrame], str] \| str` | Function to generate the text-based representation of the table in the `Doc.text` or placeholder text. Defaults to `"TABLE"`. |
167170
| `docling_options` | `dict[InputFormat, FormatOption]` | [Format options](https://ds4sd.github.io/docling/usage/#advanced-options) passed to Docling's `DocumentConverter`. |

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[metadata]
2-
version = 0.0.8
2+
version = 0.0.9
33
description = Use spaCy with PDFs, Word docs and other documents
44
url = https://github.com/explosion/spacy-layout
55
author = Explosion

spacy_layout/layout.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@ def __init__(
4848
doc_layout=attrs.get("doc_layout", "layout"),
4949
doc_pages=attrs.get("doc_pages", "pages"),
5050
doc_tables=attrs.get("doc_tables", "tables"),
51+
doc_markdown=attrs.get("doc_markdown", "markdown"),
5152
span_layout=attrs.get("span_layout", "layout"),
5253
span_heading=attrs.get("span_heading", "heading"),
5354
span_data=attrs.get("span_data", "data"),
@@ -60,6 +61,7 @@ def __init__(
6061
Doc.set_extension(self.attrs.doc_layout, default=None, force=True)
6162
Doc.set_extension(self.attrs.doc_pages, getter=self.get_pages, force=True)
6263
Doc.set_extension(self.attrs.doc_tables, getter=self.get_tables, force=True)
64+
Doc.set_extension(self.attrs.doc_markdown, default=None, force=True)
6365
Span.set_extension(self.attrs.span_layout, default=None, force=True)
6466
Span.set_extension(self.attrs.span_data, default=None, force=True)
6567
Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True)
@@ -109,6 +111,7 @@ def _result_to_doc(self, result: "ConversionResult") -> Doc:
109111
inputs.append((table_text, item))
110112
doc = self._texts_to_doc(inputs, pages)
111113
doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()]))
114+
doc._.set(self.attrs.doc_markdown, result.document.export_to_markdown())
112115
return doc
113116

114117
def _texts_to_doc(

spacy_layout/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class Attrs:
1717
doc_layout: str
1818
doc_pages: str
1919
doc_tables: str
20+
doc_markdown: str
2021
span_layout: str
2122
span_data: str
2223
span_heading: str

tests/test_general.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,15 @@ def test_table(nlp):
7575
"Chernihiv, Ukraine",
7676
],
7777
}
78+
markdown = (
79+
"| Name | Type | Place of birth |\n"
80+
"|------------------|--------|--------------------|\n"
81+
"| Ines | human | Cologne, Germany |\n"
82+
"| Matt | human | Sydney, Australia |\n"
83+
"| Baikal | cat | Berlin, Germany |\n"
84+
"| Stanislav Petrov | cat | Chernihiv, Ukraine |\n"
85+
)
86+
assert markdown in doc._.get(layout.attrs.doc_markdown)
7887

7988

8089
def test_table_placeholder(nlp):

0 commit comments

Comments
 (0)