common.py source code (Python)


Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda · Author: SignalMedia
import json


def json2csv_entities(tweets_file, outfile, main_fields, entity_type, entity_fields,
                      encoding='utf8', errors='replace', gzip_compress=False):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write to a file in CSV format.

    This utility function allows a file of full Tweets to be easily converted
    to a CSV file for easier processing of Twitter entities. For example, the
    hashtags or media elements of a tweet can be extracted.

    It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags
    there will be two lines in the output file, one per hashtag.

    :param tweets_file: The file-like object containing full Tweets.

    :param str outfile: The path of the text file where results should be\
    written

    :param list main_fields: The list of fields to be extracted from the main\
    object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
    e.g. ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count'].
    If `entity_type` is expressed with hierarchy, then it is the list of\
    fields of the object that corresponds to the key of the entity_type,\
    (e.g., for entity_type='user.urls', the fields in the main_fields list\
    belong to the user object; for entity_type='place.bounding_box', the\
    fields in the main_fields list belong to the place object of the tweet).

    :param str entity_type: The name of the entity: 'hashtags', 'media',\
    'urls' and 'user_mentions' for the tweet object. For a user object,\
    this needs to be expressed with a hierarchy: `'user.urls'`. For the\
    bounding box of the Tweet location, use `'place.bounding_box'`.

    :param list entity_fields: The list of fields to be extracted from the\
    entity. E.g. `['text']` (of the Tweet)

    :param errors: Behaviour for encoding errors, see\
    https://docs.python.org/3/library/codecs.html#codec-base-classes

    :param gzip_compress: if `True`, output files are compressed with gzip.
    """

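    # Build the CSV writer over the (optionally gzip-compressed) output
    # file; outf_writer_compat and the other helpers used below are
    # defined elsewhere in this module.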
    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
    header = get_header_field_list(main_fields, entity_type, entity_fields)
    writer.writerow(header)
    for line in tweets_file:
        tweet = json.loads(line)
        if _is_composed_key(entity_type):
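            # Hierarchical entity such as 'user.urls': locate the parent
            # object first, then pull the entity out of it.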
            key, value = _get_key_value_composed(entity_type)
            object_json = _get_entity_recursive(tweet, key)
            if not object_json:
                # this can happen in the case of "place"
                continue
            object_fields = extract_fields(object_json, main_fields)
            items = _get_entity_recursive(object_json, value)
            _write_to_file(object_fields, items, entity_fields, writer)
        else:
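            # Flat entity such as 'hashtags': extract fields directly
            # from the tweet object.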
            tweet_fields = extract_fields(tweet, main_fields)
            items = _get_entity_recursive(tweet, entity_type)
            _write_to_file(tweet_fields, items, entity_fields, writer)
    outf.close()
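
The composed-key helpers are not shown in this excerpt. A plausible minimal
sketch, consistent with how they are called above (an assumption, not
necessarily the module's verbatim code):

def _is_composed_key(entity_type):
    # A composed key names a parent object and an entity, e.g. 'user.urls'.
    return '.' in entity_type

def _get_key_value_composed(entity_type):
    # Split 'user.urls' into ('user', 'urls').
    key, value = entity_type.split('.', 1)
    return key, value

And a minimal usage sketch. The input and output file names are
hypothetical, and the import assumes this module is NLTK's
nltk/twitter/common.py, where json2csv_entities is defined:

from nltk.twitter.common import json2csv_entities

# One CSV row per hashtag: the tweet's id and text plus the hashtag text.
with open('tweets.jsonl', encoding='utf8') as fp:
    json2csv_entities(fp, 'hashtags.csv',
                      ['id', 'text'], 'hashtags', ['text'])

# Hierarchical entity: main fields come from the user object, one row per URL.
with open('tweets.jsonl', encoding='utf8') as fp:
    json2csv_entities(fp, 'user_urls.csv',
                      ['id', 'name'], 'user.urls', ['url', 'expanded_url'])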