Python string.punctuation: example source code

test_bag_of_features.py (project: xpandas, author: alan-turing-institute)
def test_bag_of_words_for_series():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    series = XSeries(dataset.data[:10])
    assert series.data_type == str

    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )

    transformed_series = tokenizer_transformer.fit_transform(series)
    # print(transformed_series)

    bag_transform = BagOfWordsTransformer()

    transformed_series = bag_transform.fit_transform(transformed_series)

    # print(transformed_series)

    assert type(transformed_series) == XDataFrame
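The three-argument form str.maketrans('', '', string.punctuation) builds a table that deletes every punctuation character. A minimal standalone sketch of the tokenizer used above (the sample text is made up):

import string

translator = str.maketrans('', '', string.punctuation)
text = 'Hello, world! This is a test...'
print(text.lower().translate(translator).strip().split())
# -> ['hello', 'world', 'this', 'is', 'a', 'test']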
util.py (project: lang-reps, author: chaitanyamalaviya)
def __iter__(self):
        """
        Read a file where each line is of the form "word1 word2 ..."
        Yields lists of the form [word1, word2, ...]
        """
        if os.path.isdir(self.fname):
            filenames = [os.path.join(self.fname,f) for f in os.listdir(self.fname)]
        else:
            filenames = [self.fname]
        for filename in filenames:
            # with io.open(filename, encoding='utf-8') as f:
            with open(filename) as f:
                doc = f.read()
                for line in doc.split("\n"):
                    #if not line:  continue
                    sent = "".join([ch for ch in line.lower() if ch not in string.punctuation]).strip().split()
                    # sent = [word for word in line.strip().split()]
                    sent = [self.begin] + sent + [self.end]
                    yield sent
util.py (project: lang-reps, author: chaitanyamalaviya)
def __iter__(self):
        """
        Read a file where each line is of the form "word1 word2 ..."
        Yields lists of the form [word1, word2, ...]
        """
        if os.path.isdir(self.fname):
            filenames = [os.path.join(self.fname, f) for f in os.listdir(self.fname)]
        else:
            filenames = [self.fname]

        for filename in filenames:
            with open(filename) as f:
                doc = f.read()
                for line in doc.split("\n"):
                    #if not line:  continue
                    sent = "".join([ch for ch in line.lower() if ch not in string.punctuation]).strip().split()
                    # sent = [word for word in line.strip().split()]
                    sent = [self.begin] + sent + [self.end]
                    yield sent
Utils.py (project: newsreap, author: caronc)
def hexdump(src, length=16, sep='.'):
    """
    Displays a hex output of the content it is passed.

    This was based on https://gist.github.com/7h3rAm/5603718 with some
    minor modifications
    """
    allowed = digits + ascii_letters + punctuation + ' '

    print_map = ''.join(((x if x in allowed else '.')
                        for x in map(chr, range(256))))
    lines = []

    for c in xrange(0, len(src), length):
        chars = src[c:c + length]
        hex = ' '.join(["%02x" % ord(x) for x in chars])
        if len(hex) > 24:
            hex = "%s %s" % (hex[:24], hex[24:])
        printable = ''.join(["%s" % (
            (ord(x) <= 127 and print_map[ord(x)]) or sep) for x in chars])
        lines.append("%08x:  %-*s  |%s|" % (c, length * 3, hex, printable))
    return '\n'.join(lines)
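This function is Python 2 code (xrange, ord over a byte string). A minimal Python 3 sketch of the same layout, assuming bytes input; hexdump3 is a hypothetical name, and the original's extra mid-row gap is omitted:

from string import digits, ascii_letters, punctuation

ALLOWED = set(digits + ascii_letters + punctuation + ' ')
PRINT_MAP = ''.join(chr(x) if chr(x) in ALLOWED else '.' for x in range(256))

def hexdump3(src, length=16):
    # same row layout as above, minus the mid-row gap, for a bytes object
    lines = []
    for offset in range(0, len(src), length):
        chunk = src[offset:offset + length]
        hexed = ' '.join('%02x' % b for b in chunk)
        printable = ''.join(PRINT_MAP[b] for b in chunk)
        lines.append('%08x:  %-*s  |%s|' % (offset, length * 3, hexed, printable))
    return '\n'.join(lines)

print(hexdump3(b'Hello, world!\x00\x01\xff'))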
pyflooder.py (project: PyFlooder, author: D4Vinci)
def attack():

    ip = socket.gethostbyname( host )
    global n
    msg=str(string.letters+string.digits+string.punctuation)
    data="".join(random.sample(msg,5))
    dos = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    try:
        n+=1
        dos.connect((ip, port))
        dos.send( "GET /%s HTTP/1.1\r\n" % data )
        print "\n "+time.ctime().split(" ")[3]+" "+"["+str(n)+"] #-#-# Hold Your Tears #-#-#"

    except socket.error:
        print "\n [ No connection! Server maybe down ] "

    dos.close()
SentenceComparator.py (project: scientific-paper-summarisation, author: EdCo95)
def removeCommonWords(self, sentence, common_words, tokenized=False):
        """Takes a sentence and list of stopwords and removes the stopwords from the sentence."""
        if not tokenized:
            words = sentence.split(' ')
        else:
            words = sentence
        final_sentence = []

        for word in words:
            word = word.translate(string.maketrans("", ""), string.punctuation)
            word = word.lower()
            if word in common_words:
                continue
            else:
                final_sentence.append(word)

        return final_sentence
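word.translate(string.maketrans('', ''), string.punctuation) is the Python 2 deletechars API and fails on Python 3. A sketch of the same stopword filter with a Python 3 translation table; remove_common_words is a hypothetical standalone name:

import string

PUNC_TABLE = str.maketrans('', '', string.punctuation)

def remove_common_words(sentence, common_words):
    # strip punctuation, lowercase, then drop stopwords
    final_sentence = []
    for word in sentence.split(' '):
        word = word.translate(PUNC_TABLE).lower()
        if word not in common_words:
            final_sentence.append(word)
    return final_sentence

print(remove_common_words('The cat, allegedly, sat!', {'the'}))
# -> ['cat', 'allegedly', 'sat']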
recipe-578397.py (project: code, author: ActiveState)
def ex3(argv):

    password = ''

    for i in range(len(argv)):
        for j in range(int(argv[i])):
            if i == 0:
                password += string.uppercase[random.randint(0,len(string.uppercase)-1)]
            elif i == 1:
                password += string.lowercase[random.randint(0,len(string.lowercase)-1)]
            elif i == 2:
                password += string.digits[random.randint(0,len(string.digits)-1)]
            elif i == 3:
                password += string.punctuation[random.randint(0,len(string.punctuation)-1)]

    return ''.join(random.sample(password,len(password)))
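string.uppercase and string.lowercase only exist in Python 2; Python 3 calls them string.ascii_uppercase and string.ascii_lowercase. A Python 3 sketch of the same per-class password builder (ex3_py3 is a hypothetical name), using random.choice instead of manual indexing:

import random
import string

def ex3_py3(counts):
    # counts = [n_upper, n_lower, n_digits, n_punctuation]
    pools = [string.ascii_uppercase, string.ascii_lowercase,
             string.digits, string.punctuation]
    password = ''.join(random.choice(pool)
                       for pool, n in zip(pools, counts)
                       for _ in range(int(n)))
    # shuffle so the character classes are not grouped together
    return ''.join(random.sample(password, len(password)))

print(ex3_py3(['2', '3', '2', '1']))  # 8 shuffled chars, e.g. 'kQ$7fxR3'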
movie_dataset.py (project: CopyNet, author: MultiPath)
def mark(line):
    tmp_line = ''
    for c in line:
        if c in string.punctuation:
            if c != "'":  # compare by value; 'is' tests identity and is unreliable for strings
                tmp_line += ' ' + c + ' '
            else:
                tmp_line += ' ' + c
        else:
            tmp_line += c
    tmp_line = tmp_line.lower()
    words = [w for w in tmp_line.split() if len(w) > 0]
    for w in words:
        if w not in word2freq:
            word2freq[w]  = 1
        else:
            word2freq[w] += 1
    return words
utils.py (project: deeppavlov, author: deepmipt)
def _normalize_answer(s):
    """Normalize string to score answers according to SQuAD dataset scoring rules.

    Remove articles, remove punctuation, fix multiple whitespaces in string, and convert all characters to lowercase.
    """

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
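The pipeline composes right to left: lowercase, strip punctuation, drop articles, collapse whitespace. Assuming re and string are imported at module level, a quick check:

import re
import string

print(_normalize_answer('The Eiffel  Tower!'))  # -> 'eiffel tower'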
preprocessing.py (project: KATE, author: hugochan)
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', \
            text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words]
extract_values.py (project: regex_extraction, author: aforsyth)
def _extract_values_from_rpdr_notes(
        rpdr_notes, phrase_type, phrases, ignore_punctuation,
        show_n_words_context_before, show_n_words_context_after):
    """Return a list of NotePhraseMatches for each note in rpdr_notes."""
    note_phrase_matches = []
    if ignore_punctuation:
        logging.info('ignore_punctuation is True, so we will also ignore '
                     'any punctuation in the entered phrases.')
        phrases = [_remove_punctuation(phrase) for phrase in phrases]
    match_contexts = PhraseMatchContexts(
        show_n_words_context_before, show_n_words_context_after)
    for rpdr_note in rpdr_notes:
        if ignore_punctuation:
            rpdr_note.remove_punctuation_from_note()
        phrase_matches = _extract_phrase_from_notes(phrase_type, phrases,
                                                    rpdr_note, match_contexts)
        note_phrase_matches.append(phrase_matches)
    match_contexts.print_ordered_contexts()
    return note_phrase_matches
vadersentiment.py (project: crypto-sentiment, author: codingupastorm)
def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
        # removes punctuation (but loses emoticons & contractions)
        words_only = no_punc_text.split()
        # remove singletons
        words_only = set( w for w in words_only if len(w) > 1 )
        # the product gives ('cat', ',') and (',', 'cat')
        punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
        punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict
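The itertools.product calls enumerate every punctuation-prefixed and punctuation-suffixed variant of each word. A standalone sketch of that mapping with a toy vocabulary:

import string
from itertools import product

PUNC_LIST = list(string.punctuation)
words_only = {'cat', 'dog'}

punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}  # ',cat' -> 'cat'
punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}   # 'cat,' -> 'cat'

mapping = {**punc_before, **punc_after}
print(mapping[',cat'], mapping['dog!'])  # -> cat dog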
irc.py (project: hostapd-mana, author: adde88)
def ping(self, user, text = None):
        """Measure round-trip delay to another IRC client.
        """
        if self._pings is None:
            self._pings = {}

        if text is None:
            chars = string.letters + string.digits + string.punctuation
            key = ''.join([random.choice(chars) for i in range(12)])
        else:
            key = str(text)
        self._pings[(user, key)] = time.time()
        self.ctcpMakeQuery(user, [('PING', key)])

        if len(self._pings) > self._MAX_PINGRING:
            # Remove some of the oldest entries.
            byValue = [(v, k) for (k, v) in self._pings.items()]
            byValue.sort()
            excess = len(self._pings) - self._MAX_PINGRING
            for i in xrange(excess):
                del self._pings[byValue[i][1]]
NewsAutosummarize.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def __init__(self, min_cut=0.1, max_cut=0.9):
        # indentation changes - we are inside the constructor
        # here we set up the behaviour
        # this is called each time an object of the frequency-summarizer
        # class is created or instantiated
        self._min_cut = min_cut    # self = keyword that refers to the instance
        self._max_cut = max_cut
        # we save the values of the two parameters passed by assigning them
        # to two member variables - the 'self.' prefix identifies them as
        # part of the self argument - using underscore as first char.
        self._stopwords = set(stopwords.words('english') + list(punctuation))
        # this is a list of all common words and punctuation symbols

    # indentation changes - we are out of the constructor here
    # This is still the body of the class
    # Defining a var here (outside a member function) but within the class
    # makes the member var STATIC. This means it belongs to the class, and
    # not to any specific individual instance (object) of the class
squad_eval.py (project: allennlp, author: allenai)
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
clean_text.py (project: glassdoor-analysis, author: THEdavehogue)
def lemmatize_text(text, stop_words=STOPLIST, keep_pos=KEEP_POS):
    '''
    Function to lemmatize a single document of the corpus

    INPUT:
        text: string, text of review
        stop_words: words to remove from text, default STOPLIST defined above
        keep_pos: parts of speech to keep in text, default KEEP_POS def above

    OUTPUT:
        lemmatized text
    '''
    x = nlp(text)
    words = [tok.lemma_.strip(punctuation) for tok in x if (
        tok.pos_ in keep_pos) and (tok.lemma_.strip(punctuation) not in STOPLIST)]
    words.extend(['boss' for tok in x if tok.lemma_ == 'bos'])
    return ' '.join(words)
common.py (project: python-tutorial, author: Akuli)
def header_link(title):
    """Return a github-style link target for a title.

    >>> header_link('Hello there!')
    'hello-there'
    """
    # This doesn't do the-title-1, the-title-2 etc. with multiple titles
    # with same text, but usually this doesn't matter.
    result = ''
    for character in title:
        if character in string.whitespace:
            result += '-'
        elif character in string.punctuation:
            pass
        else:
            result += character.lower()
    return result
common.py (project: python-tutorial, author: Akuli)
def header_link(title):
    """Return a github-style link target for a title.

    >>> header_link('Hello there!')
    'hello-there'
    """
    # This doesn't handle multiple titles with the same text in the
    # same file, but usually that's not a problem. GitHub makes
    # links like the-title, the-title-1, the-title-2 etc.
    result = ''
    for character in title:
        if character in string.whitespace:
            result += '-'
        elif character in string.punctuation:
            pass
        else:
            result += character.lower()
    return result
Mods.py (project: Luna, author: Moonlington)
def _bots(self, ctx, amount: int=100):
        """Clears bots and bot calls."""
        def check(m):
            if m.author.bot:
                return True
            for mem in m.mentions:
                if mem.bot:
                    return True
            if m.content.startswith(tuple(i for i in string.punctuation)) and not bool(re.search(r'^<@!?(\d+)>', m.content)):
                return True
            return False
        messages = await self.bot.purge_from(ctx.message.channel, limit=amount, before=ctx.message, check=check)
        await self.bot.delete_message(ctx.message)
        send = await self.bot.say("Successfully cleared **{}** messages".format(len(messages)))
        await asyncio.sleep(3)
        await self.bot.delete_message(send)
person.py (project: ChemDataExtractor, author: mcs07)
def could_be(self, other):
        """Return True if the other PersonName is not explicitly inconsistent."""
        # TODO: Some suffix and title differences should be allowed
        if type(other) is not type(self):
            return NotImplemented
        if self == other:
            return True
        for attr in ['title', 'firstname', 'middlename', 'nickname', 'prefix', 'lastname', 'suffix']:
            if attr not in self or attr not in other:
                continue
            puncmap = dict((ord(char), None) for char in string.punctuation)
            s = self[attr].lower().translate(puncmap)
            o = other[attr].lower().translate(puncmap)
            if s == o:
                continue
            if attr in {'firstname', 'middlename', 'lastname'}:
                if (({len(comp) for comp in s.split()} == {1} and [el[0] for el in o.split()] == s.split()) or
                        ({len(comp) for comp in o.split()} == {1} and [el[0] for el in s.split()] == o.split())):
                    continue
            return False
        return True
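The initials branch accepts pairs where one name is all single-letter components matching the first letters of the other. A standalone sketch of just that comparison, with made-up names:

import string

puncmap = {ord(char): None for char in string.punctuation}
s = 'J. M. S.'.lower().translate(puncmap)            # -> 'j m s'
o = 'John Maynard Smith'.lower().translate(puncmap)  # -> 'john maynard smith'

# every component of s is a single letter, and those letters
# line up with the first letters of o's components
if ({len(comp) for comp in s.split()} == {1}
        and [el[0] for el in o.split()] == s.split()):
    print('consistent: initials match the full name')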
Sentiment Analysis.py (project: Twitter-Sentiment-Analysis-For-Birthday-Celebrities, author: vishal-tiwari)
def posNegCount(self, tweet):

        pos = 0
        neg = 0

        for p in list(punctuation):
            tweet = tweet.replace(p, '')

        tweet = tweet.lower()
        words = tweet.split(' ')
        word_count = len(words)

        for word in words:
            if word in self.positive_words:
                pos = pos + 1
            elif word in self.negative_words:
                neg = neg + 1

        return pos, neg
extractive_qa_eval.py (project: jack, author: uclmr)
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
lemmatiser.py (project: LDA-REST, author: valentinarho)
def LemNormalize(text):
    # convert non ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # shortword = re.compile(r'\W*\b\w{1,2}\b')
    # transformed = shortword.sub('', transformed)

    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)

    # remove short words (3 characters or fewer)
    tokenized = [w for w in tokenized if len(w) > 3]
    tokenizer = LemTokens(tokenized)

    return tokenizer
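Unlike the delete-style tables in earlier snippets, this table maps punctuation and digits to spaces, which preserves word boundaries. A quick standalone check:

import string

table = {ord(ch): ' ' for ch in string.punctuation + string.digits}
print('foo-bar2baz!'.lower().translate(table).split())  # -> ['foo', 'bar', 'baz']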
lemmatiser.py (project: LDA-REST, author: valentinarho)
def LemNormalizeIt(text):

    # convert non ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)

    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)

    # apply lemming with morph it
    morph_it = load_morph_it()
    tokenized = [morph_it.get(w, w) for w in tokenized if len(w) > 3]

    return tokenized
utils.py (project: LDA_RecEngine, author: easonchan1213)
def preprocessing(content):
    remove_punc = ('， 。 、 ！ ？ ： ； “ ” —').split(' ')  # full-width Chinese punctuation; the exact original characters were mis-encoded, so this set is an assumption
    ## preprocessing #1 : remove XXenglishXX and numbers
    preprocessing_1 = re.compile(r'\d*',re.L)  ## only substitute numbers
    #preprocessing_1 = re.compile(r'\w*',re.L)  ## substitute number & English
    content = preprocessing_1.sub("",content)
    ## preprocessing #2 : remove punctuation
    preprocessing_2 = re.compile('[%s]' % re.escape(string.punctuation))
    content = preprocessing_2.sub("",content)
    ## preprocessing #3 : remove Chinese punctuation and multiple whitspaces
    content = content.replace('\n','')
    for punc in remove_punc:
        content = content.replace(punc,'')
    try:
        content = parsing.strip_multiple_whitespaces(content)
    except:
        print 'Warning : failed to strip whitespaces @ '   

    return content
anilist_search_methods.py (project: tsubasa-reddit-bot, author: ArmandSyah)
def get_anilist_links(title):
    """Iterates through all search methods until link is constructed"""
    exclude = set(string.punctuation)
    title = ''.join(ch for ch in title if ch not in exclude)
    title = title.lower().split(' ')
    if 'season' in title:
        title.remove('season')
    title = ' '.join(title)
    anilist_regex = re.compile(r'http(s)?://anilist.co/anime/([0-9]){1,5}(/.*)?')
    link_dispatcher = {'api': _get_anilist_link_by_api}

    for _, v in link_dispatcher.items():
        anilist_url = v(title)
        if anilist_url is None:
            continue
        if re.match(anilist_regex, anilist_url) is not None:
            return anilist_url

    return
stream_search_methods.py (project: tsubasa-reddit-bot, author: ArmandSyah)
def search_crunchyroll(anime):
    """Searches if anime exists on Crunchyroll and returns a link"""
    try:
        exclude = set(string.punctuation)
        anime = ''.join(ch for ch in anime if ch not in exclude)
        keywords = anime.split(' ')
        crunchy_api = MetaApi()
        crunchyroll_listing = []
        while len(keywords) > 0:
            crunchyroll_listing = list(crunchy_api.search_anime_series(' '.join(keywords)))
            if len(crunchyroll_listing) <= 0:
                print('No crunchyroll listings found')
                keywords.pop()
                continue
            else:
                break
    except:
        print('Crunchyroll url couldn\'t be retrieved')
        return

    return crunchyroll_listing[0].url if len(crunchyroll_listing) > 0 else None
stream_search_methods.py (project: tsubasa-reddit-bot, author: ArmandSyah)
def search_funimation(anime):
    """Checks if anime exists on Funimation website and returns a link"""
    try:
        exclude = set(string.punctuation)
        anime = ''.join(ch for ch in anime if ch not in exclude)
        keywords = anime.split(' ')
        funi_url = None
        while len(keywords) > 0:
            show_slug = '-'.join(keywords).lower()
            funi_url = f'https://www.funimation.com/shows/{show_slug}/'
            funi_url = utilities.make_get_request(funi_url)
            if funi_url is None:
                keywords.pop()
                continue
            else:
                break
    except:
        print('Funimation url couldn\'t be retrieved')
        return
    return funi_url.url if funi_url is not None else None
stream_search_methods.py (project: tsubasa-reddit-bot, author: ArmandSyah)
def search_animelab(anime):
    """Checks if anime title exists on AnimeLab website and returns a link"""
    try:
        exclude = set(string.punctuation)
        anime = ''.join(ch for ch in anime if ch not in exclude)
        keywords = anime.split(' ')
        animelab_url = None
        while len(keywords) > 0:
            show_slug = '-'.join(keywords).lower()
            animelab_url = f'https://www.animelab.com/shows/{show_slug}'
            animelab_url = utilities.make_get_request(animelab_url)
            if animelab_url is None:
                keywords.pop()
                continue
            else:
                break
    except:
        print('Animelab url couldn\'t be retrieved')
        return
    return animelab_url.url if animelab_url is not None else None
__init__.py (project: dupandas, author: shivam5992)
def __init__(self, clean_config = None):
        self.cc = {
            'lower' : False,
            'punctuation' : False,
            'whitespace' : False,
            'digit' : False,
        }

        # Override clean config and validation check
        if clean_config is not None:
            for key, value in clean_config.items():
                if key in self.cc:
                    if value not in [True, False,1,0]:
                        print ("Invalid: Incorrect boolean value: "+str(value)+" for key: " + str(key))
                    else:
                        self.cc[key] = value
                else:
                    print ("Invalid: Cleaner not recognized: " + str(key) + ", available Cleaners: " +
                                                                     ", ".join(self.cc.keys()))

        cleaners_applied = [key for key in self.cc if self.cc[key]]
        if cleaners_applied:
            print ("Applying Cleaners: " + ", ".join(cleaners_applied))
        else:
            print ("Warning: No cleaners in config")

