# net_inetSourceAnalysis.py -- Python source code snippet

def _urlQuery(self, urlInput):
        """Search urlquery.net for ``urlInput`` and scrape the result table.

        The search page is fetched over HTTP and parsed with regular
        expressions. When the page does not contain the "0 results returned"
        marker, ``self.hitcount`` is incremented and one dict per result row
        is built.

        Args:
            urlInput: The search term interpolated into the ``q=`` parameter.

        Returns:
            A dict ``{'url': urlInput, 'urlResult': ...}`` where
            ``urlResult`` is the string ``'NOT listed'`` when the marker is
            present, otherwise a list of dicts with the keys ``datum``,
            ``alerts_ids``, ``country``, ``reportUrl`` and ``report``.

        Raises:
            Whatever ``urllib2.urlopen`` / ``read`` raise on network failure;
            nothing is caught here.
        """
        # NOTE(review): this sets a global on httplib2 although the request
        # below goes through urllib2 -- looks like a debugging leftover; kept
        # so the side effect callers may observe is unchanged.
        httplib2.debuglevel = 4

        url = "http://urlquery.net/%s"
        # NOTE(review): urlInput is interpolated without URL-encoding, so
        # characters such as '&' or '#' presumably alter the query -- confirm
        # whether callers pre-encode it.
        action_search = url % "search.php?q=%s" % urlInput

        conn = urllib2.urlopen(action_search, timeout=60)
        try:
            content2String = conn.read()
        finally:
            # Always release the socket, even if read() fails.
            conn.close()

        # Only existence of the marker matters, so search() is sufficient.
        noResults = re.search(r'.*&nbsp;&nbsp;0\sresults\sreturned*',
                              content2String, re.IGNORECASE)
        if noResults:
            log.debug('urlquery Reports NOT found')
            return {'url': urlInput, 'urlResult': 'NOT listed'}

        # Reports found
        log.debug('urlquery Reports found')
        self.hitcount += 1

        flags = re.IGNORECASE
        rpdFindReport = re.findall(r"\shref='(.*?)'\>", content2String, flags)
        rpdFindReportUrl = re.findall(
            r"\<td\>\<a\stitle='(.*?)'\shref='report.php",
            content2String, flags)
        rpdFindAlertsIDS = re.findall(
            r"\<td\salign='center'\>\<b\>(.*?)\<\/b\>\<\/td\>",
            content2String, flags)
        rpdFindDatum = re.findall(
            r"\<td\>\<nobr\>\<center\>(.*?)\<\/center\>\<\/nobr\>\<\/td\>",
            content2String, flags)
        rpdFindLand = re.findall(
            r"align='left'\stitle='(.*?)'\swidth='\d{2}'\sheight='\d{2}'\s/>",
            content2String, flags)

        # The columns are scraped independently and paired by position.
        # zip() stops at the shortest list, so a page where the counts differ
        # yields only the complete rows instead of raising IndexError.
        urlqueryResults = []
        rows = zip(rpdFindDatum, rpdFindAlertsIDS, rpdFindLand,
                   rpdFindReportUrl, rpdFindReport)
        for datum, alertsIds, country, reportUrl, report in rows:
            urlqueryResults.append({
                "datum": datum,
                "alerts_ids": alertsIds,
                "country": country,
                "reportUrl": convertDirtyDict2ASCII(reportUrl),
                # Links are prefixed with the site base URL.
                "report": url % report,
            })

        return {'url': urlInput, 'urlResult': urlqueryResults}