2 changes: 2 additions & 0 deletions docs/source/api_reference/datasets.rst
@@ -15,4 +15,6 @@ Datasets

.. automodule:: textacy.datasets.imdb

.. automodule:: textacy.datasets.udhr

.. automodule:: textacy.datasets.utils
2 changes: 1 addition & 1 deletion tests/datasets/test_capitol_words.py
@@ -15,7 +15,7 @@

@pytest.mark.skip("No need to download a new dataset every time")
def test_download(tmpdir):
    dataset = CapitolWords(data_dir=str(tempdir))
    dataset = CapitolWords(data_dir=str(tmpdir))
    dataset.download()
    assert os.path.isfile(dataset._filepath)

2 changes: 1 addition & 1 deletion tests/datasets/test_imdb.py
@@ -26,7 +26,7 @@ def _skipif():
def test_download(tmpdir):
    dataset = IMDB(data_dir=str(tmpdir))
    dataset.download()
    assert all(os.path.isfile(filepath) for filepath in dataset.filepaths)
    assert dataset.data_dir.joinpath("aclImdb").is_dir()


def test_oserror(tmpdir):
2 changes: 1 addition & 1 deletion tests/datasets/test_oxford_text_archive.py
@@ -15,7 +15,7 @@

@pytest.mark.skip("No need to download a new dataset every time")
def test_download(tmpdir):
    dataset = OxfordTextArchive(data_dir=str(tempdir))
    dataset = OxfordTextArchive(data_dir=str(tmpdir))
    dataset.download()
    assert os.path.isfile(dataset._metadata_filepath)
    assert os.path.isdir(dataset._text_dirpath)
78 changes: 78 additions & 0 deletions tests/datasets/test_udhr.py
@@ -0,0 +1,78 @@
import os

import pytest

import textacy
from textacy.datasets.udhr import UDHR

DATASET = UDHR()


def _skipif():
    try:
        DATASET._check_data()
        return False
    except OSError:
        return True


pytestmark = pytest.mark.skipif(
    _skipif(),
    reason="UDHR dataset must be downloaded before running tests",
)


@pytest.mark.skip("No need to download a new dataset every time")
def test_download(tmpdir):
    dataset = UDHR(data_dir=str(tmpdir))
    dataset.download()
    assert os.path.isfile(dataset._index_filepath)
    assert os.path.isdir(dataset._texts_dirpath)


def test_oserror(tmpdir):
    dataset = UDHR(data_dir=str(tmpdir))
    with pytest.raises(OSError):
        _ = list(dataset.texts())


def test_texts():
    texts = list(DATASET.texts(limit=3))
    assert len(texts) > 0
    for text in texts:
        assert isinstance(text, str)


def test_texts_limit():
    for limit in (1, 5, 10):
        assert sum(1 for _ in DATASET.texts(limit=limit)) == limit


def test_records():
    for text, meta in DATASET.records(limit=3):
        assert isinstance(text, str)
        assert isinstance(meta, dict)


def test_records_lang():
    langs = ({"en"}, {"en", "es"})
    for lang in langs:
        records = list(DATASET.records(lang=lang, limit=10))
        assert all(meta["lang"] in lang for _, meta in records)


def test_bad_filters():
    bad_filters = (
        {"lang": "xx"},
        {"lang": ["en", "un"]},
    )
    for bad_filter in bad_filters:
        with pytest.raises(ValueError):
            list(DATASET.texts(**bad_filter))
    bad_filters = (
        {"lang": True},
        {"lang": textacy.load_spacy_lang("en")},
    )
    for bad_filter in bad_filters:
        with pytest.raises(TypeError):
            list(DATASET.texts(**bad_filter))
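
A minimal usage sketch of the new UDHR dataset, inferred only from the API exercised in the tests above (the constructor, download(), and the texts()/records() iterators with their lang filter); any metadata fields beyond "lang" are not assumed here:

>>> from textacy.datasets.udhr import UDHR
>>> ds = UDHR()
>>> ds.download()
>>> for text, meta in ds.records(lang={"en", "es"}, limit=5):
...     print(meta["lang"], len(text))
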
1 change: 1 addition & 0 deletions textacy/__main__.py
@@ -27,6 +27,7 @@
"oxford_text_archive": datasets.OxfordTextArchive,
"reddit_comments": datasets.RedditComments,
"supreme_court": datasets.SupremeCourt,
"udhr": datasets.UDHR,
"wikinews": datasets.Wikinews,
"wikipedia": datasets.Wikipedia,
"concept_net": resources.ConceptNet,
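
With "udhr" registered in this name-to-class mapping, the new dataset should be downloadable from the command line like the others; a sketch, assuming textacy's existing "python -m textacy download <name>" entry point:

$ python -m textacy download udhr
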
1 change: 1 addition & 0 deletions textacy/datasets/__init__.py
@@ -4,3 +4,4 @@
from .reddit_comments import RedditComments
from .supreme_court import SupremeCourt
from .wikimedia import Wikipedia, Wikinews
from .udhr import UDHR
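
This package-level import is what lets the CLI mapping above refer to datasets.UDHR; a quick check, as a sketch:

>>> import textacy.datasets
>>> textacy.datasets.UDHR
<class 'textacy.datasets.udhr.UDHR'>
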
22 changes: 11 additions & 11 deletions textacy/datasets/capitol_words.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""
Capitol Words
-------------
Capitol Words Congressional speeches
------------------------------------

A collection of ~11k (almost all) speeches given by the main protagonists of the
2016 U.S. Presidential election that had previously served in the U.S. Congress --
@@ -53,31 +53,31 @@ class CapitolWords(Dataset):
Download the data (one time only!) from the textacy-data repo
(https://github.com/bdewilde/textacy-data), and save its contents to disk::

>>> cw = CapitolWords()
>>> cw.download()
>>> cw.info
>>> ds = CapitolWords()
>>> ds.download()
>>> ds.info
{'name': 'capitol_words',
'site_url': 'http://sunlightlabs.github.io/Capitol-Words/',
'description': 'Collection of ~11k speeches in the Congressional Record given by notable U.S. politicians between Jan 1996 and Jun 2016.'}

Iterate over speeches as texts or records with both text and metadata::

>>> for text in cw.texts(limit=3):
>>> for text in ds.texts(limit=3):
... print(text, end="\\n\\n")
>>> for text, meta in cw.records(limit=3):
>>> for text, meta in ds.records(limit=3):
... print("\\n{} ({})\\n{}".format(meta["title"], meta["speaker_name"], text))

Filter speeches by a variety of metadata fields and text length::

>>> for text, meta in cw.records(speaker_name="Bernie Sanders", limit=3):
>>> for text, meta in ds.records(speaker_name="Bernie Sanders", limit=3):
... print("\\n{}, {}\\n{}".format(meta["title"], meta["date"], text))
>>> for text, meta in cw.records(speaker_party="D", congress={110, 111, 112},
>>> for text, meta in ds.records(speaker_party="D", congress={110, 111, 112},
... chamber="Senate", limit=3):
... print(meta["title"], meta["speaker_name"], meta["date"])
>>> for text, meta in cw.records(speaker_name={"Barack Obama", "Hillary Clinton"},
>>> for text, meta in ds.records(speaker_name={"Barack Obama", "Hillary Clinton"},
... date_range=("2005-01-01", "2005-12-31")):
... print(meta["title"], meta["speaker_name"], meta["date"])
>>> for text in cw.texts(min_len=50000):
>>> for text in ds.texts(min_len=50000):
... print(len(text))

Stream speeches into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::
10 changes: 5 additions & 5 deletions textacy/datasets/imdb.py
@@ -1,6 +1,6 @@
"""
IMDB Reviews
------------
IMDB movie reviews
------------------

A collection of 50k highly polar movie reviews posted to IMDB, split evenly
into training and testing sets, with 25k positive and 25k negative sentiment labels,
@@ -54,9 +54,9 @@ class IMDB(Dataset):

Download the data (one time only!), saving and extracting its contents to disk::

>>> imdb = IMDB()
>>> imdb.download()
>>> imdb.info
>>> ds = IMDB()
>>> ds.download()
>>> ds.info
{'name': 'imdb',
'site_url': 'http://ai.stanford.edu/~amaas/data/sentiment',
'description': 'Collection of 50k highly polar movie reviews split evenly into train and test sets, with 25k positive and 25k negative labels. Also includes some unlabeled reviews.'}
22 changes: 11 additions & 11 deletions textacy/datasets/oxford_text_archive.py
@@ -1,6 +1,6 @@
"""
Oxford Text Archive
-------------------
Oxford Text Archive literary works
----------------------------------

A collection of ~2.7k Creative Commons literary works from the Oxford Text Archive,
containing primarily English-language 16th-20th century literature and history.
@@ -51,33 +51,33 @@ class OxfordTextArchive(Dataset):

Download the data (one time only!), saving and extracting its contents to disk::

>>> ota = OxfordTextArchive()
>>> ota.download()
>>> ota.info
>>> ds = OxfordTextArchive()
>>> ds.download()
>>> ds.info
{'name': 'oxford_text_archive',
'site_url': 'https://ota.ox.ac.uk/',
'description': 'Collection of ~2.7k Creative Commons texts from the Oxford Text Archive, containing primarily English-language 16th-20th century literature and history.'}

Iterate over literary works as texts or records with both text and metadata::

>>> for text in ota.texts(limit=3):
>>> for text in ds.texts(limit=3):
... print(text[:200])
>>> for text, meta in ota.records(limit=3):
>>> for text, meta in ds.records(limit=3):
... print("\\n{}, {}".format(meta["title"], meta["year"]))
... print(text[:300])

Filter literary works by a variety of metadata fields and text length::

>>> for text, meta in ota.records(author="Shakespeare, William", limit=1):
>>> for text, meta in ds.records(author="Shakespeare, William", limit=1):
... print("{}\\n{}".format(meta["title"], text[:500]))
>>> for text, meta in ota.records(date_range=("1900-01-01", "1990-01-01"), limit=5):
>>> for text, meta in ds.records(date_range=("1900-01-01", "1990-01-01"), limit=5):
... print(meta["year"], meta["author"])
>>> for text in ota.texts(min_len=4000000):
>>> for text in ds.texts(min_len=4000000):
... print(len(text))

Stream literary works into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::

>>> textacy.Corpus("en", data=ota.records(limit=5))
>>> textacy.Corpus("en", data=ds.records(limit=5))
Corpus(5 docs; 182289 tokens)

Args:
22 changes: 11 additions & 11 deletions textacy/datasets/reddit_comments.py
@@ -1,5 +1,5 @@
"""
Reddit Comments
Reddit comments
---------------

A collection of up to ~1.5 billion Reddit comments posted from
@@ -50,34 +50,34 @@ class RedditComments(Dataset):

Download the data (one time only!) or subsets thereof by specifying a date range::

>>> rc = RedditComments()
>>> rc.download(date_range=("2007-10", "2008-03"))
>>> rc.info
>>> ds = RedditComments()
>>> ds.download(date_range=("2007-10", "2008-03"))
>>> ds.info
{'name': 'reddit_comments',
'site_url': 'https://archive.org/details/2015_reddit_comments_corpus',
'description': 'Collection of ~1.5 billion publicly available Reddit comments from October 2007 through May 2015.'}

Iterate over comments as texts or records with both text and metadata::

>>> for text in rc.texts(limit=5):
>>> for text in ds.texts(limit=5):
... print(text)
>>> for text, meta in rc.records(limit=5):
>>> for text, meta in ds.records(limit=5):
... print("\\n{} {}\\n{}".format(meta["author"], meta["created_utc"], text))

Filter comments by a variety of metadata fields and text length::

>>> for text, meta in rc.records(subreddit="politics", limit=5):
>>> for text, meta in ds.records(subreddit="politics", limit=5):
... print(meta["score"], ":", text)
>>> for text, meta in rc.records(date_range=("2008-01", "2008-03"), limit=5):
>>> for text, meta in ds.records(date_range=("2008-01", "2008-03"), limit=5):
... print(meta["created_utc"])
>>> for text, meta in rc.records(score_range=(10, None), limit=5):
>>> for text, meta in ds.records(score_range=(10, None), limit=5):
... print(meta["score"], ":", text)
>>> for text in rc.texts(min_len=2000, limit=5):
>>> for text in ds.texts(min_len=2000, limit=5):
... print(len(text))

Stream comments into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::

>>> textacy.Corpus("en", data=rc.records(limit=1000))
>>> textacy.Corpus("en", data=ds.records(limit=1000))
Corpus(1000 docs; 27582 tokens)

Args:
24 changes: 12 additions & 12 deletions textacy/datasets/supreme_court.py
@@ -1,5 +1,5 @@
"""
Supreme Court Decisions
Supreme Court decisions
-----------------------

A collection of ~8.4k (almost all) decisions issued by the U.S. Supreme Court
@@ -76,36 +76,36 @@ class SupremeCourt(Dataset):
Download the data (one time only!) from the textacy-data repo
(https://github.com/bdewilde/textacy-data), and save its contents to disk::

>>> sc = SupremeCourt()
>>> sc.download()
>>> sc.info
>>> ds = SupremeCourt()
>>> ds.download()
>>> ds.info
{'name': 'supreme_court',
'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court',
'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.'}

Iterate over decisions as texts or records with both text and metadata::

>>> for text in sc.texts(limit=3):
>>> for text in ds.texts(limit=3):
... print(text[:500], end="\\n\\n")
>>> for text, meta in sc.records(limit=3):
>>> for text, meta in ds.records(limit=3):
... print("\\n{} ({})\\n{}".format(meta["case_name"], meta["decision_date"], text[:500]))

Filter decisions by a variety of metadata fields and text length::

>>> for text, meta in sc.records(opinion_author=109, limit=3): # Notorious RBG!
>>> for text, meta in ds.records(opinion_author=109, limit=3): # Notorious RBG!
... print(meta["case_name"], meta["decision_direction"], meta["n_maj_votes"])
>>> for text, meta in sc.records(decision_direction="liberal",
>>> for text, meta in ds.records(decision_direction="liberal",
... issue_area={1, 9, 10}, limit=3):
... print(meta["case_name"], meta["maj_opinion_author"], meta["n_maj_votes"])
>>> for text, meta in sc.records(opinion_author=102, date_range=('1985-02-11', '1986-02-11')):
>>> for text, meta in ds.records(opinion_author=102, date_range=('1985-02-11', '1986-02-11')):
... print("\\n{} ({})".format(meta["case_name"], meta["decision_date"]))
... print(sc.issue_codes[meta["issue"]], "=>", meta["decision_direction"])
>>> for text in sc.texts(min_len=250000):
... print(ds.issue_codes[meta["issue"]], "=>", meta["decision_direction"])
>>> for text in ds.texts(min_len=250000):
... print(len(text))

Stream decisions into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::

>>> textacy.Corpus("en", data=sc.records(limit=25))
>>> textacy.Corpus("en", data=ds.records(limit=25))
Corpus(25 docs; 136696 tokens)

Args: