
Commit 17333a0

Merge pull request #271 from chartbeat-labs/feature/add-udhr-dataset

Add UDHR dataset

2 parents: 7500d17 + 6afd092

File tree: 14 files changed (+393 -73 lines)


docs/source/api_reference/datasets.rst (2 additions, 0 deletions)

@@ -15,4 +15,6 @@ Datasets
 
 .. automodule:: textacy.datasets.imdb
 
+.. automodule:: textacy.datasets.udhr
+
 .. automodule:: textacy.datasets.utils

tests/datasets/test_capitol_words.py (1 addition, 1 deletion)

@@ -15,7 +15,7 @@
 
 @pytest.mark.skip("No need to download a new dataset every time")
 def test_download(tmpdir):
-    dataset = CapitolWords(data_dir=str(tempdir))
+    dataset = CapitolWords(data_dir=str(tmpdir))
     dataset.download()
     assert os.path.isfile(dataset._filepath)
 

tests/datasets/test_imdb.py (1 addition, 1 deletion)

@@ -26,7 +26,7 @@ def _skipif():
 def test_download(tmpdir):
     dataset = IMDB(data_dir=str(tmpdir))
     dataset.download()
-    assert all(os.path.isfile(filepath) for filepath in dataset.filepaths)
+    assert dataset.data_dir.joinpath("aclImdb").is_dir()
 
 
 def test_oserror(tmpdir):

tests/datasets/test_oxford_text_archive.py (1 addition, 1 deletion)

@@ -15,7 +15,7 @@
 
 @pytest.mark.skip("No need to download a new dataset every time")
 def test_download(tmpdir):
-    dataset = OxfordTextArchive(data_dir=str(tempdir))
+    dataset = OxfordTextArchive(data_dir=str(tmpdir))
     dataset.download()
     assert os.path.isfile(dataset._metadata_filepath)
     assert os.path.isdir(dataset._text_dirpath)

tests/datasets/test_udhr.py (78 additions, 0 deletions)

@@ -0,0 +1,78 @@
+import os
+
+import pytest
+
+import textacy
+from textacy.datasets.udhr import UDHR
+
+DATASET = UDHR()
+
+
+def _skipif():
+    try:
+        DATASET._check_data()
+        return False
+    except OSError:
+        return True
+
+
+pytestmark = pytest.mark.skipif(
+    _skipif(),
+    reason="UDHR dataset must be downloaded before running tests",
+)
+
+
+@pytest.mark.skip("No need to download a new dataset every time")
+def test_download(tmpdir):
+    dataset = UDHR(data_dir=str(tmpdir))
+    dataset.download()
+    assert os.path.isfile(dataset._index_filepath)
+    assert os.path.isdir(dataset._texts_dirpath)
+
+
+def test_oserror(tmpdir):
+    dataset = UDHR(data_dir=str(tmpdir))
+    with pytest.raises(OSError):
+        _ = list(dataset.texts())
+
+
+def test_texts():
+    texts = list(DATASET.texts(limit=3))
+    assert len(texts) > 0
+    for text in texts:
+        assert isinstance(text, str)
+
+
+def test_texts_limit():
+    for limit in (1, 5, 10):
+        assert sum(1 for _ in DATASET.texts(limit=limit)) == limit
+
+
+def test_records():
+    for text, meta in DATASET.records(limit=3):
+        assert isinstance(text, str)
+        assert isinstance(meta, dict)
+
+
+def test_records_lang():
+    langs = ({"en"}, {"en", "es"})
+    for lang in langs:
+        records = list(DATASET.records(lang=lang, limit=10))
+        assert all(meta["lang"] in lang for _, meta in records)
+
+
+def test_bad_filters():
+    bad_filters = (
+        {"lang": "xx"},
+        {"lang": ["en", "un"]},
+    )
+    for bad_filter in bad_filters:
+        with pytest.raises(ValueError):
+            list(DATASET.texts(**bad_filter))
+    bad_filters = (
+        {"lang": True},
+        {"lang": textacy.load_spacy_lang("en")},
+    )
+    for bad_filter in bad_filters:
+        with pytest.raises(TypeError):
+            list(DATASET.texts(**bad_filter))
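For orientation, here is a minimal usage sketch of the new dataset, assembled only from calls exercised in the tests above (the UDHR() constructor, download(), texts(limit=...), records(lang=..., limit=...), and the "lang" metadata key); the printed slices are illustrative:

    from textacy.datasets.udhr import UDHR

    ds = UDHR()
    ds.download()  # one-time fetch; writes the index file and texts directory to disk

    # iterate over plain texts, or over (text, metadata) records filtered by language
    for text in ds.texts(limit=3):
        print(text[:100])
    for text, meta in ds.records(lang={"en", "es"}, limit=5):
        print(meta["lang"], text[:100])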

textacy/__main__.py (1 addition, 0 deletions)

@@ -27,6 +27,7 @@
     "oxford_text_archive": datasets.OxfordTextArchive,
     "reddit_comments": datasets.RedditComments,
     "supreme_court": datasets.SupremeCourt,
+    "udhr": datasets.UDHR,
     "wikinews": datasets.Wikinews,
     "wikipedia": datasets.Wikipedia,
     "concept_net": resources.ConceptNet,

textacy/datasets/__init__.py (1 addition, 0 deletions)

@@ -4,3 +4,4 @@
 from .reddit_comments import RedditComments
 from .supreme_court import SupremeCourt
 from .wikimedia import Wikipedia, Wikinews
+from .udhr import UDHR
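With this re-export, the new class should be importable from the subpackage root like its siblings, e.g.:

    from textacy.datasets import UDHR  # same class as textacy.datasets.udhr.UDHR

    ds = UDHR()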

textacy/datasets/capitol_words.py (11 additions, 11 deletions)

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Capitol Words
--------------
+Capitol Words Congressional speeches
+------------------------------------
 
 A collection of ~11k (almost all) speeches given by the main protagonists of the
 2016 U.S. Presidential election that had previously served in the U.S. Congress --
@@ -53,31 +53,31 @@ class CapitolWords(Dataset):
     Download the data (one time only!) from the textacy-data repo
     (https://github.com/bdewilde/textacy-data), and save its contents to disk::
 
-        >>> cw = CapitolWords()
-        >>> cw.download()
-        >>> cw.info
+        >>> ds = CapitolWords()
+        >>> ds.download()
+        >>> ds.info
         {'name': 'capitol_words',
          'site_url': 'http://sunlightlabs.github.io/Capitol-Words/',
          'description': 'Collection of ~11k speeches in the Congressional Record given by notable U.S. politicians between Jan 1996 and Jun 2016.'}
 
     Iterate over speeches as texts or records with both text and metadata::
 
-        >>> for text in cw.texts(limit=3):
+        >>> for text in ds.texts(limit=3):
         ...     print(text, end="\\n\\n")
-        >>> for text, meta in cw.records(limit=3):
+        >>> for text, meta in ds.records(limit=3):
         ...     print("\\n{} ({})\\n{}".format(meta["title"], meta["speaker_name"], text))
 
     Filter speeches by a variety of metadata fields and text length::
 
-        >>> for text, meta in cw.records(speaker_name="Bernie Sanders", limit=3):
+        >>> for text, meta in ds.records(speaker_name="Bernie Sanders", limit=3):
         ...     print("\\n{}, {}\\n{}".format(meta["title"], meta["date"], text))
-        >>> for text, meta in cw.records(speaker_party="D", congress={110, 111, 112},
+        >>> for text, meta in ds.records(speaker_party="D", congress={110, 111, 112},
         ...                              chamber="Senate", limit=3):
         ...     print(meta["title"], meta["speaker_name"], meta["date"])
-        >>> for text, meta in cw.records(speaker_name={"Barack Obama", "Hillary Clinton"},
+        >>> for text, meta in ds.records(speaker_name={"Barack Obama", "Hillary Clinton"},
         ...                              date_range=("2005-01-01", "2005-12-31")):
         ...     print(meta["title"], meta["speaker_name"], meta["date"])
-        >>> for text in cw.texts(min_len=50000):
+        >>> for text in ds.texts(min_len=50000):
         ...     print(len(text))
 
     Stream speeches into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::

textacy/datasets/imdb.py (5 additions, 5 deletions)

@@ -1,6 +1,6 @@
 """
-IMDB Reviews
-------------
+IMDB movie reviews
+------------------
 
 A collection of 50k highly polar movie reviews posted to IMDB, split evenly
 into training and testing sets, with 25k positive and 25k negative sentiment labels,
@@ -54,9 +54,9 @@ class IMDB(Dataset):
 
     Download the data (one time only!), saving and extracting its contents to disk::
 
-        >>> imdb = IMDB()
-        >>> imdb.download()
-        >>> imdb.info
+        >>> ds = IMDB()
+        >>> ds.download()
+        >>> ds.info
         {'name': 'imdb',
          'site_url': 'http://ai.stanford.edu/~amaas/data/sentiment',
          'description': 'Collection of 50k highly polar movie reviews split evenly into train and test sets, with 25k positive and 25k negative labels. Also includes some unlabeled reviews.'}

textacy/datasets/oxford_text_archive.py (11 additions, 11 deletions)

@@ -1,6 +1,6 @@
 """
-Oxford Text Archive
--------------------
+Oxford Text Archive literary works
+----------------------------------
 
 A collection of ~2.7k Creative Commons literary works from the Oxford Text Archive,
 containing primarily English-language 16th-20th century literature and history.
@@ -51,33 +51,33 @@ class OxfordTextArchive(Dataset):
 
     Download the data (one time only!), saving and extracting its contents to disk::
 
-        >>> ota = OxfordTextArchive()
-        >>> ota.download()
-        >>> ota.info
+        >>> ds = OxfordTextArchive()
+        >>> ds.download()
+        >>> ds.info
         {'name': 'oxford_text_archive',
         'site_url': 'https://ota.ox.ac.uk/',
         'description': 'Collection of ~2.7k Creative Commons texts from the Oxford Text Archive, containing primarily English-language 16th-20th century literature and history.'}
 
     Iterate over literary works as texts or records with both text and metadata::
 
-        >>> for text in ota.texts(limit=3):
+        >>> for text in ds.texts(limit=3):
         ...     print(text[:200])
-        >>> for text, meta in ota.records(limit=3):
+        >>> for text, meta in ds.records(limit=3):
         ...     print("\\n{}, {}".format(meta["title"], meta["year"]))
         ...     print(text[:300])
 
     Filter literary works by a variety of metadata fields and text length::
 
-        >>> for text, meta in ota.records(author="Shakespeare, William", limit=1):
+        >>> for text, meta in ds.records(author="Shakespeare, William", limit=1):
         ...     print("{}\\n{}".format(meta["title"], text[:500]))
-        >>> for text, meta in ota.records(date_range=("1900-01-01", "1990-01-01"), limit=5):
+        >>> for text, meta in ds.records(date_range=("1900-01-01", "1990-01-01"), limit=5):
         ...     print(meta["year"], meta["author"])
-        >>> for text in ota.texts(min_len=4000000):
+        >>> for text in ds.texts(min_len=4000000):
         ...     print(len(text))
 
     Stream literary works into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::
 
-        >>> textacy.Corpus("en", data=ota.records(limit=5))
+        >>> textacy.Corpus("en", data=ds.records(limit=5))
         Corpus(5 docs; 182289 tokens)
 
     Args:
