def result_format(json, keyorder, link):
    message = ""
    if keyorder:
        try:
            # sorted() returns an ordered list of (key, value) tuples;
            # keyorder.index raises ValueError for keys not in keyorder
            ordered_result = sorted(json.items(),
                                    key=lambda i: keyorder.index(i[0]))
        except Exception:
            ordered_result = []
        message = Utilities.get_html_from_list(ordered_result)
    else:
        message = Utilities.get_html_from_dictionary(json)
    # add a helper link if one is provided
    if link:
        message += '<a href="%s">See more</a>' % link
    return message
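The keyorder trick above is worth isolating: sorting a dict's items() by each key's position in a reference list yields a deterministic ordering. A minimal sketch (the sample data and key order are invented for illustration):

data = {"b": 2, "a": 1, "c": 3}
keyorder = ["c", "a", "b"]
# each item is a (key, value) tuple; rank it by its key's position in keyorder
ordered = sorted(data.items(), key=lambda i: keyorder.index(i[0]))
print(ordered)  # [('c', 3), ('a', 1), ('b', 2)]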
Example source code for Python's items() method
def _get_entity_recursive(json, entity):
    if not json:
        return None
    elif isinstance(json, dict):
        for key, value in json.items():
            if key == entity:
                return value
            # 'entities' and 'extended_entities' are wrappers in the Twitter
            # JSON structure that contain other Twitter objects. See:
            # https://dev.twitter.com/overview/api/entities-in-twitter-objects
            if key == 'entities' or key == 'extended_entities':
                candidate = _get_entity_recursive(value, entity)
                if candidate is not None:
                    return candidate
        return None
    elif isinstance(json, list):
        for item in json:
            candidate = _get_entity_recursive(item, entity)
            if candidate is not None:
                return candidate
        return None
    else:
        return None
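A quick check of the recursion, using a hand-rolled fragment of a tweet (the payload below is invented for illustration):

tweet = {"id": 1, "entities": {"hashtags": [{"text": "python"}]}}
print(_get_entity_recursive(tweet, "hashtags"))
# [{'text': 'python'}]
print(_get_entity_recursive(tweet, "media"))
# None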
Source: common.py, from the project PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda by SignalMedia.
def result_format(json, keyorder, link, style, location="", file_name=""):
    message = ""
    if keyorder:
        try:
            # sorted() returns an ordered list of (key, value) tuples
            ordered_result = sorted(json.items(),
                                    key=lambda i: keyorder.index(i[0]))
        except Exception as e:
            print(e)
            ordered_result = []
        message = Utilities.get_html_from_list(ordered_result, style)
    else:
        message = Utilities.get_html_from_dictionary(json)
    # add a helper link if one is provided
    if link:
        message += '<a href="%s" style="%s">See more</a>' % (link, style['link'])
    if location and file_name:
        row = location[0]
        col = location[1]
        target = file_name + '$$$' + str(row) + ',' + str(col)
        message += '<br><a href="%s" style="%s">Go To Document</a>' % (target, style['link'])
    return message
def deserialize_json(json):
    """
    A helper method for deserializing json into Cytoscape.js elements.
    :param json: json representation of Cytoscape.js element
    :return: Cytoscape.js element object if json is valid, else json
    """
    class_name = json.pop('__classname__', None)
    if class_name == 'Element':
        # make an instance without calling __init__
        obj = Element.__new__(Element)
        for key, value in json.items():
            setattr(obj, key, value)
        return obj
    else:
        return json
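A round-trip sketch for deserialize_json, assuming only a minimal stand-in for the project's Element class (the real class and its attributes live elsewhere in that codebase):

class Element:
    """Minimal stand-in for the project's Element class (hypothetical)."""
    pass

payload = {'__classname__': 'Element', 'group': 'nodes', 'data': {'id': 'n1'}}
elem = deserialize_json(payload)
# '__classname__' is popped off; every remaining key becomes an attribute
print(type(elem).__name__, elem.group, elem.data)
# Element nodes {'id': 'n1'}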
def _write_to_file(object_fields, items, entity_fields, writer):
    if not items:
        # the entity may simply be absent from the tweet: e.g. a tweet's
        # hashtags are always present (even as []), but its media may not be
        return
    if isinstance(items, dict):
        # this happens e.g. for the "place" of a tweet
        row = object_fields
        # there might be composed keys in the list of required fields
        entity_field_values = [x for x in entity_fields if not _is_composed_key(x)]
        entity_field_composed = [x for x in entity_fields if _is_composed_key(x)]
        for field in entity_field_values:
            value = items[field]
            if isinstance(value, list):
                row += value
            else:
                row += [value]
        # now check the required dictionaries
        for d in entity_field_composed:
            kd, vd = _get_key_value_composed(d)
            json_dict = items[kd]
            if not isinstance(json_dict, dict):
                raise RuntimeError("""Key {0} does not contain a dictionary
                in the json file""".format(kd))
            row += [json_dict[vd]]
        writer.writerow(row)
        return
    # in general it is a list
    for item in items:
        row = object_fields + extract_fields(item, entity_fields)
        writer.writerow(row)
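To see the dict branch in action, here is a minimal sketch. The two helper stand-ins below are hypothetical: they mimic the behaviour the function assumes, where a composed key joins an outer and inner field with a dot (e.g. 'bounding_box.type'):

import csv
import sys

def _is_composed_key(field):
    # hypothetical stand-in: composed keys look like 'outer.inner'
    return '.' in field

def _get_key_value_composed(field):
    # hypothetical stand-in: split 'outer.inner' into ('outer', 'inner')
    return tuple(field.split('.', 1))

writer = csv.writer(sys.stdout)
place = {"name": "London", "bounding_box": {"type": "Polygon"}}
_write_to_file(["123"], place, ["name", "bounding_box.type"], writer)
# prints: 123,London,Polygon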
def match_selection(self, sel, tooltip_files, scope):
    """Collect the results, links, and keys to be shown in the tooltip."""
    results = []
    count = 0
    dynamic_doc_arr = self.search_for_dynamic_doc(sel, scope)
    if dynamic_doc_arr:
        results += dynamic_doc_arr
    else:
        self.logger_msg += 'There is no documentation in dynamic doc\n'
    for file in tooltip_files:
        # search for the parameter in the json file
        json_result = self.search_in_json(sel, file['file_name'])
        items = []
        if isinstance(json_result, dict):
            items.append(json_result)
        elif isinstance(json_result, list):
            items += json_result
        for item in items:
            result = {}
            if item:
                result['json_result'] = item
                result['file_name'] = file['file_name']
                # get the correct link for the result
                if 'link' not in item and 'link' in file:
                    result['link'] = file['link']
                results.append(result)
                # get the keys from the result
                keys = list(item.keys())
                # add the keys to keyorder and count the changes
                count += self.update_keyorder_list(keys)
    # if anything changed, save it in the settings
    if count != 0:
        self.save_keyorder_list()
    return results
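One detail worth pulling out of match_selection is how it normalizes a lookup that may return either a single dict or a list of dicts, so the rest of the loop only ever deals with a list. In isolation (the sample value is invented):

json_result = {"name": "param"}  # a search may also return a list of dicts
items = []
if isinstance(json_result, dict):
    items.append(json_result)
elif isinstance(json_result, list):
    items += json_result
print(items)  # [{'name': 'param'}]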
def json2csv_entities(tweets_file, outfile, main_fields, entity_type, entity_fields,
                      encoding='utf8', errors='replace', gzip_compress=False):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write them to a file in CSV format.

    This utility function allows a file of full Tweets to be easily converted
    to a CSV file for easier processing of Twitter entities. For example, the
    hashtags or media elements of a tweet can be extracted.

    It produces one line per entity of a Tweet, e.g. if a tweet has two
    hashtags there will be two lines in the output file, one per hashtag.

    :param tweets_file: the file-like object containing full Tweets
    :param str outfile: The path of the text file where results should be\
    written
    :param list main_fields: The list of fields to be extracted from the main\
    object, usually the tweet. A useful example: 'id_str' for the tweet ID. See\
    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
    E.g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
    If `entity_type` is expressed with a hierarchy, then it is the list of\
    fields of the object that corresponds to the key of the entity_type\
    (e.g., for entity_type='user.urls', the fields in the main_fields list\
    belong to the user object; for entity_type='place.bounding_box', the\
    fields in the main_fields list belong to the place object of the tweet).
    :param str entity_type: The name of the entity: 'hashtags', 'media',\
    'urls' and 'user_mentions' for the tweet object. For a user object,\
    this needs to be expressed with a hierarchy: `'user.urls'`. For the\
    bounding box of the Tweet location, use `'place.bounding_box'`.
    :param list entity_fields: The list of fields to be extracted from the\
    entity. E.g. `['text']` (of the Tweet)
    :param errors: Behaviour for encoding errors, see\
    https://docs.python.org/3/library/codecs.html#codec-base-classes
    :param gzip_compress: if `True`, output files are compressed with gzip
    """
    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
    header = get_header_field_list(main_fields, entity_type, entity_fields)
    writer.writerow(header)
    for line in tweets_file:
        tweet = json.loads(line)
        if _is_composed_key(entity_type):
            key, value = _get_key_value_composed(entity_type)
            object_json = _get_entity_recursive(tweet, key)
            if not object_json:
                # this can happen in the case of "place"
                continue
            object_fields = extract_fields(object_json, main_fields)
            items = _get_entity_recursive(object_json, value)
            _write_to_file(object_fields, items, entity_fields, writer)
        else:
            tweet_fields = extract_fields(tweet, main_fields)
            items = _get_entity_recursive(tweet, entity_type)
            _write_to_file(tweet_fields, items, entity_fields, writer)
    outf.close()
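A hypothetical invocation, assuming the helpers used above (outf_writer_compat, get_header_field_list, extract_fields) are importable from the same module, and that tweets.jsonl holds one tweet per line; both the file names and the field choices are invented for illustration:

with open('tweets.jsonl', encoding='utf8') as tweets:
    # one CSV row per hashtag: the tweet id plus the hashtag text
    json2csv_entities(tweets, 'hashtags.csv',
                      ['id_str'], 'hashtags', ['text'])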
def feeds(self):
    """List GitHub's timeline resources in Atom format.

    :returns: dictionary parsed to include URITemplates
    """
    def replace_href(feed_dict):
        if not feed_dict:
            return feed_dict
        ret_dict = {}
        # Let's pluck out what we're most interested in, the href value
        href = feed_dict.pop('href', None)
        # Then we update the return dictionary with the rest of the values
        ret_dict.update(feed_dict)
        if href is not None:
            # So long as there is something to template, let's template it
            ret_dict['href'] = URITemplate(href)
        return ret_dict

    url = self._build_url('feeds')
    json = self._json(self._get(url), 200, include_cache_info=False)
    if json is None:  # If something went wrong, get out early
        return None
    # We have a response body to parse
    feeds = {}
    # Let's pop out the old links so we don't have to skip them below
    old_links = json.pop('_links', {})
    _links = {}
    # If _links is in the response JSON, iterate over it and recreate it
    # so that any templates contained inside can be turned into
    # URITemplates
    for key, value in old_links.items():
        if isinstance(value, list):
            # If it's an array/list of links, replace it with a new
            # list of links
            _links[key] = [replace_href(d) for d in value]
        else:
            # Otherwise, just use the new value
            _links[key] = replace_href(value)
    # Start building up our return dictionary
    feeds['_links'] = _links
    for key, value in json.items():
        # This is roughly the same logic as above.
        if isinstance(value, list):
            feeds[key] = [URITemplate(v) for v in value]
        else:
            feeds[key] = URITemplate(value)
    return feeds
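This method appears to come from the github3.py library. A hedged usage sketch follows; the 'timeline_url' key matches GitHub's documented /feeds response, but treat both the key name and the unauthenticated client as assumptions:

import github3

gh = github3.GitHub()
feeds = gh.feeds()
# each value is a URITemplate, ready to be expanded with parameters
print(feeds['timeline_url'].expand())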