def __init__(self):
    """Set up article storage, cookie handling and the cookie-aware opener."""
    self.articles = []
    self.query = None
    self.cjar = MozillaCookieJar()

    # Restore cookies from a previous session when a jar file is configured
    # and actually present on disk.
    jar_file = ScholarConf.COOKIE_JAR_FILE
    if jar_file and os.path.exists(jar_file):
        try:
            self.cjar.load(jar_file, ignore_discard=True)
            ScholarUtils.log('info', 'loaded cookies file')
        except Exception as msg:
            ScholarUtils.log('warn', 'could not load cookies file: %s' % msg)
            self.cjar = MozillaCookieJar()  # Just to be safe

    self.opener = build_opener(HTTPCookieProcessor(self.cjar))
    self.settings = None  # Last settings object, if any
Example source code using instances of the Python class MozillaCookieJar()
def __init__(self):
    """Set up article storage, cookie handling and the cookie-aware opener."""
    self.articles = []
    self.query = None
    self.cjar = MozillaCookieJar()

    # Restore cookies from a previous session when a jar file is configured
    # and actually present on disk.
    jar_file = ScholarConf.COOKIE_JAR_FILE
    if jar_file and os.path.exists(jar_file):
        try:
            self.cjar.load(jar_file, ignore_discard=True)
            ScholarUtils.log('info', 'loaded cookies file')
        except Exception as msg:
            ScholarUtils.log('warn', 'could not load cookies file: %s' % msg)
            self.cjar = MozillaCookieJar()  # Just to be safe

    self.opener = build_opener(HTTPCookieProcessor(self.cjar))
    self.settings = None  # Last settings object, if any
def urlopen_test(host):
    """Probe https://<host> with browser-like headers through a cookie-aware
    opener.

    Returns True when the page body could be fetched, False on any error
    (the exception is printed).
    """
    headers = [('Host', host),
               ('Connection', 'keep-alive'),
               ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
               ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'),
               #('Accept-Encoding','gzip,deflate'),
               ('Accept-Language', 'en-US,en;q=0.5')]
    # Use a MozillaCookieJar so any cookies set by the server are honoured
    # for the duration of the request.
    cookie = cookielib.MozillaCookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    req = urllib2.Request(u'https://' + host)
    first_opener = urllib2.build_opener(handler)
    first_opener.addheaders = headers
    try:
        result = first_opener.open(req, timeout=60)  # 60s timeout
        try:
            body = result.read()
        finally:
            result.close()  # fix: response object was never closed
        # read() returns a string (possibly empty), never None; this keeps
        # the original outcome (any successful fetch -> True) but uses an
        # identity test instead of "!= None".
        if body is not None:
            return True
    except Exception as e:  # fix: py2.6+/py3-compatible except syntax
        print(e)
    return False
def test_bad_magic(self):
    """Loading a nonexistent file raises IOError; a file with a bad magic
    string raises LoadError."""
    from cookielib import LWPCookieJar, MozillaCookieJar, LoadError
    # IOErrors (eg. file doesn't exist) are allowed to propagate.
    filename = test_support.TESTFN
    for cookiejar_class in LWPCookieJar, MozillaCookieJar:
        c = cookiejar_class()
        try:
            c.load(filename="for this test to work, a file with this "
                            "filename should not exist")
        except IOError as exc:  # fix: py2.6+/py3-compatible except syntax
            # exactly IOError, not the LoadError subclass
            self.assertEqual(exc.__class__, IOError)
        else:
            self.fail("expected IOError for invalid filename")
    # Invalid contents of cookies file (eg. bad magic string)
    # causes a LoadError.
    try:
        f = open(filename, "w")
        try:
            f.write("oops\n")
        finally:
            f.close()  # fix: flush/close the file before load() re-reads it
        for cookiejar_class in LWPCookieJar, MozillaCookieJar:
            c = cookiejar_class()
            self.assertRaises(LoadError, c.load, filename)
    finally:
        try:
            os.unlink(filename)
        except OSError:
            pass
def test_bad_magic(self):
    """Loading a nonexistent file raises IOError; a file with a bad magic
    string raises LoadError."""
    from cookielib import LWPCookieJar, MozillaCookieJar, LoadError
    # IOErrors (eg. file doesn't exist) are allowed to propagate.
    filename = test_support.TESTFN
    for cookiejar_class in LWPCookieJar, MozillaCookieJar:
        c = cookiejar_class()
        try:
            c.load(filename="for this test to work, a file with this "
                            "filename should not exist")
        except IOError as exc:  # fix: py2.6+/py3-compatible except syntax
            # exactly IOError, not the LoadError subclass
            self.assertEqual(exc.__class__, IOError)
        else:
            self.fail("expected IOError for invalid filename")
    # Invalid contents of cookies file (eg. bad magic string)
    # causes a LoadError.
    try:
        f = open(filename, "w")
        try:
            f.write("oops\n")
        finally:
            f.close()  # fix: flush/close the file before load() re-reads it
        for cookiejar_class in LWPCookieJar, MozillaCookieJar:
            c = cookiejar_class()
            self.assertRaises(LoadError, c.load, filename)
    finally:
        try:
            os.unlink(filename)
        except OSError:
            pass
medium_crawler.py 文件源码
项目:Medium-crawler-with-data-analyzer
作者: lifei96
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def get_following(user_id):
    """Return the distinct usernames that *user_id* follows on Medium.

    Walks the paginated /following API endpoint until no "to" pagination
    token is returned.
    """
    def fetch(page_url):
        # One cookie-aware GET with a browser-like User-Agent header.
        jar = cookielib.MozillaCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        request = urllib2.Request(page_url)
        request.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/50.0.2661.102 Safari/537.36')
        return opener.open(request, timeout=10).read()

    base_url = 'https://medium.com/_/api/users/' + user_id + '/following'
    data = fetch(base_url)
    following_set = set(re.findall('"username":"(.*?)","createdAt"', data))
    to = re.findall('"to":"(.*?)"}}},"v"', data)
    while to:
        data = fetch(base_url + '?to=' + to[0])
        following_set.update(re.findall('"username":"(.*?)","createdAt"', data))
        to = re.findall('"to":"(.*?)"}}},"v"', data)
    return list(following_set)
medium_crawler.py 文件源码
项目:Medium-crawler-with-data-analyzer
作者: lifei96
项目源码
文件源码
阅读 19
收藏 0
点赞 0
评论 0
def get_followers(user_id):
    """Return the distinct usernames following *user_id* on Medium.

    Walks the paginated /followers API endpoint until no "to" pagination
    token is returned.
    """
    def fetch(page_url):
        # One cookie-aware GET with a browser-like User-Agent header.
        jar = cookielib.MozillaCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        request = urllib2.Request(page_url)
        request.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/50.0.2661.102 Safari/537.36')
        return opener.open(request, timeout=10).read()

    base_url = 'https://medium.com/_/api/users/' + user_id + '/followers'
    data = fetch(base_url)
    followers_set = set(re.findall('"username":"(.*?)","createdAt"', data))
    to = re.findall('"to":"(.*?)"}}},"v"', data)
    while to:
        data = fetch(base_url + '?to=' + to[0])
        followers_set.update(re.findall('"username":"(.*?)","createdAt"', data))
        to = re.findall('"to":"(.*?)"}}},"v"', data)
    return list(followers_set)
medium_crawler.py 文件源码
项目:Medium-crawler-with-data-analyzer
作者: lifei96
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def get_latest(user_id):
    """Return the distinct post ids from a Medium user's "latest" stream.

    Walks the paginated profile/stream endpoint until no "to" pagination
    token is returned.
    """
    def fetch(page_url):
        # One cookie-aware GET with a browser-like User-Agent header.
        jar = cookielib.MozillaCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        request = urllib2.Request(page_url)
        request.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/50.0.2661.102 Safari/537.36')
        return opener.open(request, timeout=10).read()

    base_url = 'https://medium.com/_/api/users/' + user_id + '/profile/stream?source=latest'
    data = fetch(base_url)
    latest_set = set(re.findall('"postId":"(.*?)"},"randomId"', data))
    to = re.findall('"to":"(.*?)","source":"latest"', data)
    while to:
        # The base URL already carries a query string, so pagination uses '&'.
        data = fetch(base_url + '&to=' + to[0])
        latest_set.update(re.findall('"postId":"(.*?)"},"randomId"', data))
        to = re.findall('"to":"(.*?)","source":"latest"', data)
    return list(latest_set)
medium_crawler.py 文件源码
项目:Medium-crawler-with-data-analyzer
作者: lifei96
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def get_recommends(user_id):
    """Return the distinct post ids a Medium user has recommended.

    Walks the paginated profile/stream endpoint until no "to" pagination
    token is returned.
    """
    def fetch(page_url):
        # One cookie-aware GET with a browser-like User-Agent header.
        jar = cookielib.MozillaCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        request = urllib2.Request(page_url)
        request.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/50.0.2661.102 Safari/537.36')
        return opener.open(request, timeout=10).read()

    base_url = 'https://medium.com/_/api/users/' + user_id + '/profile/stream?source=has-recommended'
    data = fetch(base_url)
    recommends_set = set(re.findall('w":{"postId":"(.*?)"},"randomId"', data))
    to = re.findall('"to":"(.*?)","source":"has-recommended"', data)
    while to:
        # The base URL already carries a query string, so pagination uses '&'.
        data = fetch(base_url + '&to=' + to[0])
        recommends_set.update(re.findall('w":{"postId":"(.*?)"},"randomId"', data))
        to = re.findall('"to":"(.*?)","source":"has-recommended"', data)
    return list(recommends_set)
medium_crawler.py 文件源码
项目:Medium-crawler-with-data-analyzer
作者: lifei96
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def get_highlights(user_id):
    """Return the distinct post ids a Medium user has highlighted (quotes).

    Walks the paginated profile/stream endpoint until no "to" pagination
    token is returned.
    """
    def fetch(page_url):
        # One cookie-aware GET with a browser-like User-Agent header.
        jar = cookielib.MozillaCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        request = urllib2.Request(page_url)
        request.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/50.0.2661.102 Safari/537.36')
        return opener.open(request, timeout=10).read()

    base_url = 'https://medium.com/_/api/users/' + user_id + '/profile/stream?source=quotes'
    data = fetch(base_url)
    highlights_set = set(re.findall('","postId":"(.*?)","userId":"', data))
    to = re.findall('"to":"(.*?)","source":"quotes"', data)
    while to:
        # The base URL already carries a query string, so pagination uses '&'.
        data = fetch(base_url + '&to=' + to[0])
        highlights_set.update(re.findall('","postId":"(.*?)","userId":"', data))
        to = re.findall('"to":"(.*?)","source":"quotes"', data)
    return list(highlights_set)
medium_crawler.py 文件源码
项目:Medium-crawler-with-data-analyzer
作者: lifei96
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def get_twitter_profile(username, twitter_id):
    """Scrape the embedded JSON profile blob from a Twitter profile page and
    dump a pruned copy to ./Twitter/<username>_t.json."""
    url = "https://twitter.com/" + str(twitter_id) + "?lang=en"
    cj = cookielib.MozillaCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    req = urllib2.Request(url)
    req.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/50.0.2661.102 Safari/537.36')
    response = opener.open(req, timeout=10)
    data = response.read()
    profile_data = re.findall('class="json-data" value="(.*?)">', data)
    # fix: the page HTML-escapes quotes inside the value attribute; the
    # original line had degraded into a no-op replace('"', '"').
    profile = json.loads(profile_data[0].replace('&quot;', '"'))
    # Drop bulky page-state sections we do not analyze.
    for key in ("promptbirdData", "wtfOptions", "typeaheadData", "dm",
                "initialState", "activeHashflags", "keyboardShortcuts",
                "deciders"):
        profile.pop(key, None)
    out = codecs.open("./Twitter/%s_t.json" % username, 'w', 'utf-8')
    out.write(json.dumps(profile, indent=4))
    out.close()
def __init__(self):
    """Set up article storage, cookie handling and the cookie-aware opener."""
    self.articles = []
    self.query = None
    self.cjar = MozillaCookieJar()

    # Restore cookies from a previous session when a jar file is configured
    # and actually present on disk.
    jar_file = ScholarConf.COOKIE_JAR_FILE
    if jar_file and os.path.exists(jar_file):
        try:
            self.cjar.load(jar_file, ignore_discard=True)
            ScholarUtils.log('info', 'loaded cookies file')
        except Exception as msg:
            ScholarUtils.log('warn', 'could not load cookies file: %s' % msg)
            self.cjar = MozillaCookieJar()  # Just to be safe

    self.opener = build_opener(HTTPCookieProcessor(self.cjar))
    self.settings = None  # Last settings object, if any
def __init__(self):
    """Set up article storage, cookie handling and the cookie-aware opener."""
    self.articles = []
    self.query = None
    self.cjar = MozillaCookieJar()

    # Restore cookies from a previous session when a jar file is configured
    # and actually present on disk.
    jar_file = ScholarConf.COOKIE_JAR_FILE
    if jar_file and os.path.exists(jar_file):
        try:
            self.cjar.load(jar_file, ignore_discard=True)
            ScholarUtils.log('info', 'loaded cookies file')
        except Exception as msg:
            ScholarUtils.log('warn', 'could not load cookies file: %s' % msg)
            self.cjar = MozillaCookieJar()  # Just to be safe

    self.opener = build_opener(HTTPCookieProcessor(self.cjar))
    self.settings = None  # Last settings object, if any
def test_bad_magic(self):
    """Loading a nonexistent file raises IOError; a file with a bad magic
    string raises LoadError."""
    from cookielib import LWPCookieJar, MozillaCookieJar, LoadError
    # IOErrors (eg. file doesn't exist) are allowed to propagate.
    filename = test_support.TESTFN
    for cookiejar_class in LWPCookieJar, MozillaCookieJar:
        c = cookiejar_class()
        try:
            c.load(filename="for this test to work, a file with this "
                            "filename should not exist")
        except IOError as exc:  # fix: py2.6+/py3-compatible except syntax
            # exactly IOError, not the LoadError subclass
            self.assertEqual(exc.__class__, IOError)
        else:
            self.fail("expected IOError for invalid filename")
    # Invalid contents of cookies file (eg. bad magic string)
    # causes a LoadError.
    try:
        f = open(filename, "w")
        try:
            f.write("oops\n")
        finally:
            f.close()  # fix: flush/close the file before load() re-reads it
        for cookiejar_class in LWPCookieJar, MozillaCookieJar:
            c = cookiejar_class()
            self.assertRaises(LoadError, c.load, filename)
    finally:
        try:
            os.unlink(filename)
        except OSError:
            pass
def test_bad_magic(self):
    """Loading a nonexistent file raises IOError; a file with a bad magic
    string raises LoadError."""
    from cookielib import LWPCookieJar, MozillaCookieJar, LoadError
    # IOErrors (eg. file doesn't exist) are allowed to propagate.
    filename = test_support.TESTFN
    for cookiejar_class in LWPCookieJar, MozillaCookieJar:
        c = cookiejar_class()
        try:
            c.load(filename="for this test to work, a file with this "
                            "filename should not exist")
        except IOError as exc:  # fix: py2.6+/py3-compatible except syntax
            # exactly IOError, not the LoadError subclass
            self.assertEqual(exc.__class__, IOError)
        else:
            self.fail("expected IOError for invalid filename")
    # Invalid contents of cookies file (eg. bad magic string)
    # causes a LoadError.
    try:
        f = open(filename, "w")
        try:
            f.write("oops\n")
        finally:
            f.close()  # fix: flush/close the file before load() re-reads it
        for cookiejar_class in LWPCookieJar, MozillaCookieJar:
            c = cookiejar_class()
            self.assertRaises(LoadError, c.load, filename)
    finally:
        try:
            os.unlink(filename)
        except OSError:
            pass
def __init__(self):
    """Set up article storage, cookie handling and the cookie-aware opener,
    reporting cookie-file status on stdout."""
    self.articles = []
    self.query = None
    self.cjar = MozillaCookieJar()

    # Restore cookies from a previous session when a jar file is configured
    # and actually present on disk.
    jar_file = ScholarConf.COOKIE_JAR_FILE
    if jar_file and os.path.exists(jar_file):
        try:
            self.cjar.load(jar_file, ignore_discard=True)
            print("Using cookie file")
            ScholarUtils.log('info', 'loaded cookies file')
        except Exception as msg:
            print("Ignoring cookie file: %s" % msg)
            ScholarUtils.log('warn', 'could not load cookies file: %s' % msg)
            self.cjar = MozillaCookieJar()  # Just to be safe

    self.opener = build_opener(HTTPCookieProcessor(self.cjar))
    self.settings = None  # Last settings object, if any
airbnb_superhost_photo_crawler_from_failed_list.py 文件源码
项目:Airbnb-superhosts-crawler-with-data-analyzer
作者: lifei96
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def get_name(uid):
    """Fetch an Airbnb user page and extract the display name.

    Returns '' when the page cannot be fetched or no name is found.
    """
    url = 'https://www.airbnb.com/users/show/' + uid + '?locale=en'
    jar = cookielib.MozillaCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    request = urllib2.Request(url)
    request.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/50.0.2661.102 Safari/537.36')
    try:
        response = opener.open(request, timeout=10)
    except Exception as inst:
        print(type(inst))
        print(inst.args)
        print(inst)
        print('-----fail to get name data')
        return ''
    data = response.read()
    # The greeting text on the profile page carries the display name.
    name = re.findall('Hey, I’m (.*?)!', data)
    if name:
        return name[0]
    return ''
airbnb_superhost_photo_crawler.py 文件源码
项目:Airbnb-superhosts-crawler-with-data-analyzer
作者: lifei96
项目源码
文件源码
阅读 17
收藏 0
点赞 0
评论 0
def get_name(uid):
    """Fetch an Airbnb user page and extract the display name.

    Returns '' when the page cannot be fetched or no name is found.
    """
    url = 'https://www.airbnb.com/users/show/' + uid + '?locale=en'
    jar = cookielib.MozillaCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    request = urllib2.Request(url)
    request.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/50.0.2661.102 Safari/537.36')
    try:
        response = opener.open(request, timeout=10)
    except Exception as inst:
        print(type(inst))
        print(inst.args)
        print(inst)
        print('-----fail to get name data')
        return ''
    data = response.read()
    # The greeting text on the profile page carries the display name.
    name = re.findall('Hey, I’m (.*?)!', data)
    if name:
        return name[0]
    return ''
def clearCookie(self, cookiefile, leaveNames=[], removeNames=None, ignore_discard=True):
    """Remove cookies from *cookiefile* and write it back.

    Cookies named in *leaveNames* are always kept; when *removeNames* is
    given, only cookies named there are removed (otherwise every cookie not
    in *leaveNames* goes).  Returns True on success, False when anything
    fails (the traceback is printed).
    """
    try:
        # Use the jar flavour this session was configured with.
        if self.useMozillaCookieJar:
            jar = cookielib.MozillaCookieJar()
        else:
            jar = cookielib.LWPCookieJar()
        jar.load(cookiefile, ignore_discard=ignore_discard)
        # Collect first, then clear: clearing while iterating the jar would
        # mutate it mid-iteration.
        doomed = [item for item in jar
                  if item.name not in leaveNames and
                  (None == removeNames or item.name in removeNames)]
        for item in doomed:
            jar.clear(item.domain, item.path, item.name)
        jar.save(cookiefile, ignore_discard=ignore_discard)
    except Exception:
        printExc()
        return False
    return True
def __init__(self):
    """Create a mechanize browser with a Mozilla-format cookie jar and
    browser-like settings, pointed at the target product page."""
    self.br = mechanize.Browser()
    self.cj = cookielib.MozillaCookieJar()
    self.br.set_cookiejar(self.cj)
    # Behave like a regular browser: honour http-equiv and referers,
    # but ignore robots.txt.
    self.br.set_handle_equiv(True)
    self.br.set_handle_referer(True)
    self.br.set_handle_robots(False)
    self.br.addheaders = [('User-agent', 'Firefox')]
    self.item_url = 'http://shop.bdgastore.com/collections/footwear/products/y-3-pureboost-zg'
# Create variables for user credentials and a function to import them
def login(form_data):
    """POST Sina SSO credentials, follow the JS redirect, and persist the
    session cookies to *cookie_file*."""
    url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0')
    cookie = cookielib.MozillaCookieJar(cookie_file)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    opener.addheaders.append(headers)
    reply = opener.open(url, form_data)
    redirect_result = reply.read()
    # The SSO endpoint answers with a tiny page whose JavaScript performs
    # the real redirect; pull the target out of location.replace('...').
    login_pattern = r'location.replace\(\'(.*?)\'\)'
    login_url = re.search(login_pattern, redirect_result).group(1)
    opener.open(login_url).read()
    cookie.save(cookie_file, ignore_discard=True, ignore_expires=True)
def request_image_url(image_path):
    """Upload a local JPEG to Weibo's pic_upload endpoint and return the
    public image URL.

    Requires a previously saved cookie file (see login()).
    """
    cookie = cookielib.MozillaCookieJar()
    cookie.load(cookie_file, ignore_expires=False, ignore_discard=True)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    image_url = 'http://picupload.service.weibo.com/interface/pic_upload.php?mime=image%2Fjpeg&data=base64&url=0&markpos=1&logo=&nick=0&marks=1&app=miniblog'
    # fix: read the image with open() in binary mode (file() is deprecated,
    # and text mode corrupts binary data on Windows) and close the handle.
    fh = open(image_path, 'rb')
    try:
        b = base64.b64encode(fh.read())
    finally:
        fh.close()
    data = urllib.urlencode({'b64_data': b})
    result = opener.open(image_url, data).read()
    # Strip the HTML wrapper the endpoint puts around its JSON payload.
    result = re.sub(r"<meta.*</script>", "", result, flags=re.S)
    image_result = json.loads(result)
    image_id = image_result.get('data').get('pics').get('pic_1').get('pid')
    return 'https://ws3.sinaimg.cn/large/%s.jpg' % image_id
def login(self, login_url="http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)"):
    """Log into Weibo through the SSO endpoint, save cookies to Cookie.txt
    and dump the reply page to test.html for inspection."""
    j_data = self.get_sso()
    # Form fields expected by ssologin.js v1.4.18; servertime/nonce/rsakv
    # come from the pre-login handshake, 'sp' is the encrypted password.
    postdata = {
        'entry': "weibo",
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'pagerefer': "",
        'vsnf': '1',
        'su': self.get_user(),
        'service': 'miniblog',
        'servertime': j_data.get("servertime"),
        'nonce': j_data.get("nonce"),
        'pwencode': 'rsa2',
        'rsakv': j_data.get("rsakv"),
        'sp': self.get_passwd(j_data.get("pubkey"), j_data.get("servertime"), j_data.get("nonce")),
        'sr': "1440*900",
        'encoding': 'UTF-8',
        'prelt': '503',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    jar = cookielib.MozillaCookieJar("Cookie.txt")
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    reply = opener.open(login_url, urllib.urlencode(postdata))
    jar.save(ignore_discard=True, ignore_expires=True)
    dump_path = os.path.join(os.path.dirname(__file__), "test.html")
    with open(dump_path, "wb") as f:
        f.write(reply.read())
    print(u"????")
def login(self, login_url="http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)"):
    """Log into Weibo through the SSO endpoint, save cookies to Cookie.txt
    and dump a sample profile page to text.html for inspection."""
    j_data = self.get_sso()
    # Form fields expected by ssologin.js v1.4.18; servertime/nonce/rsakv
    # come from the pre-login handshake, 'sp' is the encrypted password.
    postdata = {
        'entry': "weibo",
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'pagerefer': "",
        'vsnf': '1',
        'su': self.get_user(),
        'service': 'miniblog',
        'servertime': j_data.get("servertime"),
        'nonce': j_data.get("nonce"),
        'pwencode': 'rsa2',
        'rsakv': j_data.get("rsakv"),
        'sp': self.get_passwd(j_data.get("pubkey"), j_data.get("servertime"), j_data.get("nonce")),
        'sr': "1440*900",
        'encoding': 'UTF-8',
        'prelt': '503',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    jar = cookielib.MozillaCookieJar("Cookie.txt")
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    opener.open(login_url, urllib.urlencode(postdata))
    jar.save(ignore_discard=True, ignore_expires=True)
    # Fetch a profile page with the now-authenticated opener as a check.
    html = opener.open(
        "http://weibo.com/p/1005055887581312").read()
    dump_path = os.path.join(os.path.dirname(__file__), "text.html")
    with open(dump_path, "wb") as f:
        f.write(html)
    print("????")
def test_missing_value(self):
    """Cookies with a missing '=' keep name-only semantics across a
    Netscape-format save/revert round trip."""
    from cookielib import MozillaCookieJar, lwp_cookie_str
    # A missing '=' sign in a Cookie: header is regarded by Mozilla as a
    # missing name, and by cookielib as a missing value.
    filename = test_support.TESTFN
    c = MozillaCookieJar(filename)
    interact_netscape(c, "http://www.acme.com/", 'eggs')
    interact_netscape(c, "http://www.acme.com/", '"spam"; path=/foo/')
    cookie = c._cookies["www.acme.com"]["/"]["eggs"]
    self.assertIsNone(cookie.value)
    self.assertEqual(cookie.name, "eggs")
    cookie = c._cookies["www.acme.com"]['/foo/']['"spam"']
    self.assertIsNone(cookie.value)
    self.assertEqual(cookie.name, '"spam"')
    self.assertEqual(lwp_cookie_str(cookie), (
        r'"spam"; path="/foo/"; domain="www.acme.com"; '
        'path_spec; discard; version=0'))
    old_str = repr(c)
    c.save(ignore_expires=True, ignore_discard=True)
    try:
        c = MozillaCookieJar(filename)
        c.revert(ignore_expires=True, ignore_discard=True)
    finally:
        os.unlink(c.filename)
    # Cookies are unchanged apart from losing the "was the path explicitly
    # specified?" bit, which the Netscape file format cannot represent.
    self.assertEqual(
        repr(c),
        re.sub("path_specified=%s" % True, "path_specified=%s" % False,
               old_str)
    )
    self.assertEqual(interact_netscape(c, "http://www.acme.com/foo/"),
                     '"spam"; eggs')
def test_missing_value(self):
    """Cookies with a missing '=' keep name-only semantics across a
    Netscape-format save/revert round trip."""
    from cookielib import MozillaCookieJar, lwp_cookie_str
    # A missing '=' sign in a Cookie: header is regarded by Mozilla as a
    # missing name, and by cookielib as a missing value.
    filename = test_support.TESTFN
    c = MozillaCookieJar(filename)
    interact_netscape(c, "http://www.acme.com/", 'eggs')
    interact_netscape(c, "http://www.acme.com/", '"spam"; path=/foo/')
    cookie = c._cookies["www.acme.com"]["/"]["eggs"]
    self.assertIsNone(cookie.value)
    self.assertEqual(cookie.name, "eggs")
    cookie = c._cookies["www.acme.com"]['/foo/']['"spam"']
    self.assertIsNone(cookie.value)
    self.assertEqual(cookie.name, '"spam"')
    self.assertEqual(lwp_cookie_str(cookie), (
        r'"spam"; path="/foo/"; domain="www.acme.com"; '
        'path_spec; discard; version=0'))
    old_str = repr(c)
    c.save(ignore_expires=True, ignore_discard=True)
    try:
        c = MozillaCookieJar(filename)
        c.revert(ignore_expires=True, ignore_discard=True)
    finally:
        os.unlink(c.filename)
    # Cookies are unchanged apart from losing the "was the path explicitly
    # specified?" bit, which the Netscape file format cannot represent.
    self.assertEqual(
        repr(c),
        re.sub("path_specified=%s" % True, "path_specified=%s" % False,
               old_str)
    )
    self.assertEqual(interact_netscape(c, "http://www.acme.com/foo/"),
                     '"spam"; eggs')
medium_topstories_crawler.py 文件源码
项目:Medium-crawler-with-data-analyzer
作者: lifei96
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def get_top_stories():
    """Download Medium's daily top-stories pages between START_DATE and
    END_DATE (inclusive) and dump one JSON file per day under ./TopStories/."""
    current_date = START_DATE
    while current_date <= END_DATE:
        top_stories = TopStories()
        date_string = current_date.strftime("%B-%d-%Y").lower()
        url = "https://medium.com/browse/top/" + date_string
        top_stories.data['date'] = current_date.isoformat()
        top_stories.data['url'] = url
        jar = cookielib.MozillaCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        req = urllib2.Request(url)
        req.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/50.0.2661.102 Safari/537.36')
        page = opener.open(req, timeout=10).read()
        # Story links carry a ?source=top_stories tracking suffix we strip.
        story_url = re.findall('<a class="link link--darken" href="(.*?)\?source=top_stories---------[0-9]*-" data-action="open-post"', page)
        stories = []
        for index, link in enumerate(story_url):
            story_data = get_story(link).data
            if story_data['success']:
                stories.append(story_data)
            print(index)
        top_stories.data['stories'] = stories
        out = codecs.open("./TopStories/%s.json" % current_date.isoformat(), 'w', 'utf-8')
        out.write(top_stories.getstr())
        out.close()
        print("-----%s obtained" % current_date.isoformat())
        current_date = current_date + datetime.timedelta(days=1)
def baiduInitialization(filename):
    """Install a global cookie-aware urllib2 opener backed by the module
    jar *_cj*, loading existing cookies when *filename* is present."""
    global _cj
    _cj = cookielib.MozillaCookieJar()
    urllib2.install_opener(
        urllib2.build_opener(urllib2.HTTPCookieProcessor(_cj)))
    if os.path.exists(filename):
        _cj.load(filename, True)
def load_cookie(self, filename):
    """Replace the session's cookies with those stored in
    <cookie_dir>/<filename>.

    Missing or unreadable files are tolerated — the session simply gets an
    empty jar.  Returns True normally, False when something unexpected
    fails (the error is logged).
    """
    try:
        jar = cookielib.MozillaCookieJar()
        try:
            jar.load(self.cookie_dir + '/' + filename, ignore_discard=True)
        except:
            # Deliberate best-effort: a missing or corrupt cookie file
            # just leaves the jar empty.
            pass
        self.sess.cookies = jar
        return True
    except Exception as e:
        logging.error('Exp {0} : {1}'.format(FuncName(), e))
        return False
def saveCookies(self, cookiejar):
    """Copy *cookiejar* into a Mozilla-format jar and write it to the
    file 'youdaoCookies'."""
    # Renamed the local from 'MozillaCookieJar' — it shadowed the class.
    mozilla_jar = cookielib.MozillaCookieJar()
    for c in cookiejar:
        args = dict(vars(c).items())
        # cookielib.Cookie() expects the keyword 'rest', not the private
        # attribute name '_rest' that vars() exposes.
        args['rest'] = args.pop('_rest')
        mozilla_jar.set_cookie(cookielib.Cookie(**args))
    mozilla_jar.save('youdaoCookies', ignore_discard=True)