# Source code for nltk.test.unit.test_json2csv_corpus
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2024 NLTK Project
# Author: Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Regression tests for `json2csv()` and `json2csv_entities()` in Twitter
package.
"""
from pathlib import Path
import pytest
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv, json2csv_entities
[docs]
def files_are_identical(pathA, pathB):
    """
    Tell whether two files hold the same content, line by line.

    Each file is read as bytes and split into lines; every line is then
    stripped of leading and trailing whitespace (which also drops carriage
    returns) before the two line lists are compared.

    :param pathA: first file; must provide ``read_bytes()`` (e.g. ``pathlib.Path``)
    :param pathB: second file; same requirement as ``pathA``
    :return: ``True`` if the stripped line lists are equal, ``False`` otherwise
    """

    def stripped_lines(path):
        # Raw bytes are compared, so no text decoding is involved.
        raw_lines = path.read_bytes().splitlines()
        return [line.strip() for line in raw_lines]

    return stripped_lines(pathA) == stripped_lines(pathB)
# Directory next to this module that holds the ``*.csv.ref`` reference files.
subdir = Path(__file__).parent.joinpath("files")
[docs]
@pytest.fixture
def infile():
    """
    Provide the first 100 lines of the ``tweets.20150430-223406.json``
    Twitter sample file.

    :return: list of 100 strings, one per line of the sample file, each
        still carrying its line terminator
    :raises StopIteration: if the sample file has fewer than 100 lines
    """
    sample_path = twitter_samples.abspath("tweets.20150430-223406.json")
    # Decode explicitly instead of relying on the platform's locale encoding,
    # so the fixture yields the same text on every OS.
    # NOTE(review): assumes the sample corpus is UTF-8 encoded -- confirm
    # against the twitter_samples corpus reader configuration.
    with open(sample_path, encoding="utf-8") as tweets:
        # A local name distinct from the fixture avoids shadowing ``infile``.
        return [next(tweets) for _ in range(100)]
[docs]
def test_textoutput(tmp_path, infile):
    """
    Run ``json2csv()`` with only the ``text`` field and check the CSV it
    writes against the stored reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.text.csv"
    expected = subdir / "tweets.20150430-223406.text.csv.ref"

    json2csv(infile, produced, ["text"], gzip_compress=False)

    assert files_are_identical(produced, expected)
[docs]
def test_tweet_metadata(tmp_path, infile):
    """
    Run ``json2csv()`` with a set of tweet-level fields (plus ``user.id``)
    and check the CSV it writes against the stored reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.tweet.csv"
    expected = subdir / "tweets.20150430-223406.tweet.csv.ref"
    tweet_fields = [
        "created_at",
        "favorite_count",
        "id",
        "in_reply_to_status_id",
        "in_reply_to_user_id",
        "retweet_count",
        "retweeted",
        "text",
        "truncated",
        "user.id",
    ]

    json2csv(infile, produced, tweet_fields, gzip_compress=False)

    assert files_are_identical(produced, expected)
[docs]
def test_user_metadata(tmp_path, infile):
    """
    Run ``json2csv()`` with ``id``, ``text`` and several ``user.*`` fields
    and check the CSV it writes against the stored reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.user.csv"
    expected = subdir / "tweets.20150430-223406.user.csv.ref"
    user_fields = [
        "id",
        "text",
        "user.id",
        "user.followers_count",
        "user.friends_count",
    ]

    json2csv(infile, produced, user_fields, gzip_compress=False)

    assert files_are_identical(produced, expected)
[docs]
def test_tweet_hashtag(tmp_path, infile):
    """
    Run ``json2csv_entities()`` for ``"hashtags"`` (fields ``id``/``text``
    plus ``text``) and check the CSV it writes against the stored
    reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.hashtag.csv"
    expected = subdir / "tweets.20150430-223406.hashtag.csv.ref"
    tweet_fields = ["id", "text"]
    hashtag_fields = ["text"]

    json2csv_entities(
        infile, produced, tweet_fields, "hashtags", hashtag_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
[docs]
def test_tweet_usermention(tmp_path, infile):
    """
    Run ``json2csv_entities()`` for ``"user_mentions"`` (fields
    ``id``/``text`` plus ``id``/``screen_name``) and check the CSV it
    writes against the stored reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.usermention.csv"
    expected = subdir / "tweets.20150430-223406.usermention.csv.ref"
    tweet_fields = ["id", "text"]
    mention_fields = ["id", "screen_name"]

    json2csv_entities(
        infile,
        produced,
        tweet_fields,
        "user_mentions",
        mention_fields,
        gzip_compress=False,
    )

    assert files_are_identical(produced, expected)
[docs]
def test_tweet_media(tmp_path, infile):
    """
    Run ``json2csv_entities()`` for ``"media"`` (field ``id`` plus
    ``media_url``/``url``) and check the CSV it writes against the stored
    reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.media.csv"
    expected = subdir / "tweets.20150430-223406.media.csv.ref"
    tweet_fields = ["id"]
    media_fields = ["media_url", "url"]

    json2csv_entities(
        infile, produced, tweet_fields, "media", media_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
[docs]
def test_tweet_url(tmp_path, infile):
    """
    Run ``json2csv_entities()`` for ``"urls"`` (field ``id`` plus
    ``url``/``expanded_url``) and check the CSV it writes against the
    stored reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.url.csv"
    expected = subdir / "tweets.20150430-223406.url.csv.ref"
    tweet_fields = ["id"]
    url_fields = ["url", "expanded_url"]

    json2csv_entities(
        infile, produced, tweet_fields, "urls", url_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
[docs]
def test_userurl(tmp_path, infile):
    """
    Run ``json2csv_entities()`` for ``"user.urls"`` (fields
    ``id``/``screen_name`` plus ``url``/``expanded_url``) and check the CSV
    it writes against the stored reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.userurl.csv"
    expected = subdir / "tweets.20150430-223406.userurl.csv.ref"
    leading_fields = ["id", "screen_name"]
    url_fields = ["url", "expanded_url"]

    json2csv_entities(
        infile, produced, leading_fields, "user.urls", url_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
[docs]
def test_tweet_place(tmp_path, infile):
    """
    Run ``json2csv_entities()`` for ``"place"`` (fields ``id``/``text``
    plus ``name``/``country``) and check the CSV it writes against the
    stored reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.place.csv"
    expected = subdir / "tweets.20150430-223406.place.csv.ref"
    tweet_fields = ["id", "text"]
    place_fields = ["name", "country"]

    json2csv_entities(
        infile, produced, tweet_fields, "place", place_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
[docs]
def test_tweet_place_boundingbox(tmp_path, infile):
    """
    Run ``json2csv_entities()`` for ``"place.bounding_box"`` (fields
    ``id``/``name`` plus ``coordinates``) and check the CSV it writes
    against the stored reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.placeboundingbox.csv"
    expected = subdir / "tweets.20150430-223406.placeboundingbox.csv.ref"
    leading_fields = ["id", "name"]
    box_fields = ["coordinates"]

    json2csv_entities(
        infile,
        produced,
        leading_fields,
        "place.bounding_box",
        box_fields,
        gzip_compress=False,
    )

    assert files_are_identical(produced, expected)
[docs]
def test_retweet_original_tweet(tmp_path, infile):
    """
    Run ``json2csv_entities()`` for ``"retweeted_status"`` (field ``id``
    plus a set of tweet-level fields) and check the CSV it writes against
    the stored reference file.
    """
    produced = tmp_path / "tweets.20150430-223406.retweet.csv"
    expected = subdir / "tweets.20150430-223406.retweet.csv.ref"
    tweet_fields = ["id"]
    retweeted_fields = [
        "created_at",
        "favorite_count",
        "id",
        "in_reply_to_status_id",
        "in_reply_to_user_id",
        "retweet_count",
        "text",
        "truncated",
        "user.id",
    ]

    json2csv_entities(
        infile,
        produced,
        tweet_fields,
        "retweeted_status",
        retweeted_fields,
        gzip_compress=False,
    )

    assert files_are_identical(produced, expected)
[docs]
def test_file_is_wrong(tmp_path, infile):
    """
    Guard against false positives in the file comparison: the ``text``
    CSV output is compared with the *retweet* reference file, and the two
    must be reported as different.
    """
    produced = tmp_path / "tweets.20150430-223406.text.csv"
    # Deliberately the wrong reference file for this output.
    mismatched_ref = subdir / "tweets.20150430-223406.retweet.csv.ref"

    json2csv(infile, produced, ["text"], gzip_compress=False)

    assert not files_are_identical(produced, mismatched_ref)