
Commit 17333a0

Merge pull request #271 from chartbeat-labs/feature/add-udhr-dataset

Add UDHR dataset

2 parents: 7500d17 + 6afd092

File tree: 14 files changed (+393 -73 lines)


docs/source/api_reference/datasets.rst (2 additions, 0 deletions)

@@ -15,4 +15,6 @@ Datasets
 
 .. automodule:: textacy.datasets.imdb
 
+.. automodule:: textacy.datasets.udhr
+
 .. automodule:: textacy.datasets.utils

tests/datasets/test_capitol_words.py (1 addition, 1 deletion)

@@ -15,7 +15,7 @@
 
 @pytest.mark.skip("No need to download a new dataset every time")
 def test_download(tmpdir):
-    dataset = CapitolWords(data_dir=str(tempdir))
+    dataset = CapitolWords(data_dir=str(tmpdir))
     dataset.download()
     assert os.path.isfile(dataset._filepath)
 

tests/datasets/test_imdb.py (1 addition, 1 deletion)

@@ -26,7 +26,7 @@ def _skipif():
 def test_download(tmpdir):
     dataset = IMDB(data_dir=str(tmpdir))
     dataset.download()
-    assert all(os.path.isfile(filepath) for filepath in dataset.filepaths)
+    assert dataset.data_dir.joinpath("aclImdb").is_dir()
 
 
 def test_oserror(tmpdir):

tests/datasets/test_oxford_text_archive.py (1 addition, 1 deletion)

@@ -15,7 +15,7 @@
 
 @pytest.mark.skip("No need to download a new dataset every time")
 def test_download(tmpdir):
-    dataset = OxfordTextArchive(data_dir=str(tempdir))
+    dataset = OxfordTextArchive(data_dir=str(tmpdir))
     dataset.download()
     assert os.path.isfile(dataset._metadata_filepath)
     assert os.path.isdir(dataset._text_dirpath)

tests/datasets/test_udhr.py (78 additions, 0 deletions)

@@ -0,0 +1,78 @@
+import os
+
+import pytest
+
+import textacy
+from textacy.datasets.udhr import UDHR
+
+DATASET = UDHR()
+
+
+def _skipif():
+    try:
+        DATASET._check_data()
+        return False
+    except OSError:
+        return True
+
+
+pytestmark = pytest.mark.skipif(
+    _skipif(),
+    reason="UDHR dataset must be downloaded before running tests",
+)
+
+
+@pytest.mark.skip("No need to download a new dataset every time")
+def test_download(tmpdir):
+    dataset = UDHR(data_dir=str(tmpdir))
+    dataset.download()
+    assert os.path.isfile(dataset._index_filepath)
+    assert os.path.isdir(dataset._texts_dirpath)
+
+
+def test_oserror(tmpdir):
+    dataset = UDHR(data_dir=str(tmpdir))
+    with pytest.raises(OSError):
+        _ = list(dataset.texts())
+
+
+def test_texts():
+    texts = list(DATASET.texts(limit=3))
+    assert len(texts) > 0
+    for text in texts:
+        assert isinstance(text, str)
+
+
+def test_texts_limit():
+    for limit in (1, 5, 10):
+        assert sum(1 for _ in DATASET.texts(limit=limit)) == limit
+
+
+def test_records():
+    for text, meta in DATASET.records(limit=3):
+        assert isinstance(text, str)
+        assert isinstance(meta, dict)
+
+
+def test_records_lang():
+    langs = ({"en"}, {"en", "es"})
+    for lang in langs:
+        records = list(DATASET.records(lang=lang, limit=10))
+        assert all(meta["lang"] in lang for _, meta in records)
+
+
+def test_bad_filters():
+    bad_filters = (
+        {"lang": "xx"},
+        {"lang": ["en", "un"]},
+    )
+    for bad_filter in bad_filters:
+        with pytest.raises(ValueError):
+            list(DATASET.texts(**bad_filter))
+    bad_filters = (
+        {"lang": True},
+        {"lang": textacy.load_spacy_lang("en")},
+    )
+    for bad_filter in bad_filters:
+        with pytest.raises(TypeError):
+            list(DATASET.texts(**bad_filter))
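For orientation, here is a minimal usage sketch of the new dataset, assembled only from calls exercised in the tests above (the UDHR() constructor, download(), texts(limit=...), records(lang=..., limit=...), and the "lang" metadata key); the printed slices are illustrative:

    from textacy.datasets.udhr import UDHR

    ds = UDHR()
    ds.download()  # one-time fetch; writes the index file and texts directory to disk

    # iterate over plain texts, or over (text, metadata) records filtered by language
    for text in ds.texts(limit=3):
        print(text[:100])
    for text, meta in ds.records(lang={"en", "es"}, limit=5):
        print(meta["lang"], text[:100])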

textacy/__main__.py (1 addition, 0 deletions)

@@ -27,6 +27,7 @@
     "oxford_text_archive": datasets.OxfordTextArchive,
     "reddit_comments": datasets.RedditComments,
     "supreme_court": datasets.SupremeCourt,
+    "udhr": datasets.UDHR,
     "wikinews": datasets.Wikinews,
     "wikipedia": datasets.Wikipedia,
     "concept_net": resources.ConceptNet,

textacy/datasets/__init__.py (1 addition, 0 deletions)

@@ -4,3 +4,4 @@
 from .reddit_comments import RedditComments
 from .supreme_court import SupremeCourt
 from .wikimedia import Wikipedia, Wikinews
+from .udhr import UDHR
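With this re-export, the new class should be importable from the subpackage root like its siblings, e.g.:

    from textacy.datasets import UDHR  # same class as textacy.datasets.udhr.UDHR

    ds = UDHR()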

textacy/datasets/capitol_words.py (11 additions, 11 deletions)

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Capitol Words
--------------
+Capitol Words Congressional speeches
+------------------------------------
 
 A collection of ~11k (almost all) speeches given by the main protagonists of the
 2016 U.S. Presidential election that had previously served in the U.S. Congress --
@@ -53,31 +53,31 @@ class CapitolWords(Dataset):
     Download the data (one time only!) from the textacy-data repo
     (https://github.com/bdewilde/textacy-data), and save its contents to disk::
 
-        >>> cw = CapitolWords()
-        >>> cw.download()
-        >>> cw.info
+        >>> ds = CapitolWords()
+        >>> ds.download()
+        >>> ds.info
         {'name': 'capitol_words',
          'site_url': 'http://sunlightlabs.github.io/Capitol-Words/',
          'description': 'Collection of ~11k speeches in the Congressional Record given by notable U.S. politicians between Jan 1996 and Jun 2016.'}
 
     Iterate over speeches as texts or records with both text and metadata::
 
-        >>> for text in cw.texts(limit=3):
+        >>> for text in ds.texts(limit=3):
         ...     print(text, end="\\n\\n")
-        >>> for text, meta in cw.records(limit=3):
+        >>> for text, meta in ds.records(limit=3):
         ...     print("\\n{} ({})\\n{}".format(meta["title"], meta["speaker_name"], text))
 
     Filter speeches by a variety of metadata fields and text length::
 
-        >>> for text, meta in cw.records(speaker_name="Bernie Sanders", limit=3):
+        >>> for text, meta in ds.records(speaker_name="Bernie Sanders", limit=3):
         ...     print("\\n{}, {}\\n{}".format(meta["title"], meta["date"], text))
-        >>> for text, meta in cw.records(speaker_party="D", congress={110, 111, 112},
+        >>> for text, meta in ds.records(speaker_party="D", congress={110, 111, 112},
         ...                              chamber="Senate", limit=3):
         ...     print(meta["title"], meta["speaker_name"], meta["date"])
-        >>> for text, meta in cw.records(speaker_name={"Barack Obama", "Hillary Clinton"},
+        >>> for text, meta in ds.records(speaker_name={"Barack Obama", "Hillary Clinton"},
         ...                              date_range=("2005-01-01", "2005-12-31")):
         ...     print(meta["title"], meta["speaker_name"], meta["date"])
-        >>> for text in cw.texts(min_len=50000):
+        >>> for text in ds.texts(min_len=50000):
         ...     print(len(text))
 
     Stream speeches into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::

textacy/datasets/imdb.py (5 additions, 5 deletions)

@@ -1,6 +1,6 @@
 """
-IMDB Reviews
-------------
+IMDB movie reviews
+------------------
 
 A collection of 50k highly polar movie reviews posted to IMDB, split evenly
 into training and testing sets, with 25k positive and 25k negative sentiment labels,
@@ -54,9 +54,9 @@ class IMDB(Dataset):
 
     Download the data (one time only!), saving and extracting its contents to disk::
 
-        >>> imdb = IMDB()
-        >>> imdb.download()
-        >>> imdb.info
+        >>> ds = IMDB()
+        >>> ds.download()
+        >>> ds.info
         {'name': 'imdb',
          'site_url': 'http://ai.stanford.edu/~amaas/data/sentiment',
          'description': 'Collection of 50k highly polar movie reviews split evenly into train and test sets, with 25k positive and 25k negative labels. Also includes some unlabeled reviews.'}

textacy/datasets/oxford_text_archive.py (11 additions, 11 deletions)

@@ -1,6 +1,6 @@
 """
-Oxford Text Archive
--------------------
+Oxford Text Archive literary works
+----------------------------------
 
 A collection of ~2.7k Creative Commons literary works from the Oxford Text Archive,
 containing primarily English-language 16th-20th century literature and history.
@@ -51,33 +51,33 @@ class OxfordTextArchive(Dataset):
 
     Download the data (one time only!), saving and extracting its contents to disk::
 
-        >>> ota = OxfordTextArchive()
-        >>> ota.download()
-        >>> ota.info
+        >>> ds = OxfordTextArchive()
+        >>> ds.download()
+        >>> ds.info
         {'name': 'oxford_text_archive',
         'site_url': 'https://ota.ox.ac.uk/',
         'description': 'Collection of ~2.7k Creative Commons texts from the Oxford Text Archive, containing primarily English-language 16th-20th century literature and history.'}
 
     Iterate over literary works as texts or records with both text and metadata::
 
-        >>> for text in ota.texts(limit=3):
+        >>> for text in ds.texts(limit=3):
         ...     print(text[:200])
-        >>> for text, meta in ota.records(limit=3):
+        >>> for text, meta in ds.records(limit=3):
         ...     print("\\n{}, {}".format(meta["title"], meta["year"]))
         ...     print(text[:300])
 
     Filter literary works by a variety of metadata fields and text length::
 
-        >>> for text, meta in ota.records(author="Shakespeare, William", limit=1):
+        >>> for text, meta in ds.records(author="Shakespeare, William", limit=1):
         ...     print("{}\\n{}".format(meta["title"], text[:500]))
-        >>> for text, meta in ota.records(date_range=("1900-01-01", "1990-01-01"), limit=5):
+        >>> for text, meta in ds.records(date_range=("1900-01-01", "1990-01-01"), limit=5):
         ...     print(meta["year"], meta["author"])
-        >>> for text in ota.texts(min_len=4000000):
+        >>> for text in ds.texts(min_len=4000000):
         ...     print(len(text))
 
     Stream literary works into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::
 
-        >>> textacy.Corpus("en", data=ota.records(limit=5))
+        >>> textacy.Corpus("en", data=ds.records(limit=5))
         Corpus(5 docs; 182289 tokens)
 
     Args:
