def process_request(self, request, spider):
    """Fetch ``request.url`` with ``self.browser`` for the "jobbole" spider.

    Args:
        request: object exposing ``url``; passed through to the response.
        spider: object exposing ``name``.

    Returns:
        An ``HtmlResponse`` built from the browser's current URL and page
        source when ``spider.name`` is ``"jobbole"``; otherwise ``None``.
    """
    if spider.name != "jobbole":
        return None

    import time

    self.browser.get(request.url)
    # Fixed 3-second pause before reading the page
    # (presumably to let it finish rendering -- TODO confirm).
    time.sleep(3)
    print("??:{0}".format(request.url))
    return HtmlResponse(
        url=self.browser.current_url,
        body=self.browser.page_source,
        encoding="utf-8",
        request=request,
    )
# On Linux (presumably for running without a visible display):
# from pyvirtualdisplay import Display
# display = Display(visible=0, size=(800, 600))
# display.start()
#
# browser = webdriver.Chrome()
# browser.get()
python类Chrome()的实例源码
def populate_text_field(driver, element_locator, text):
    '''Types the provided text into the located text field

    Args:
        driver: object exposing ``find_element``; the locator is unpacked into that call.
        element_locator ((selenium.webdriver.common.by.By., str)): element locator described using `By`. Take a look at `Locate elements By <http://selenium-python.readthedocs.io/api.html#locate-elements-by>`_ for more info.
        text (str): text to send to the located element.

    Example:
        ::

            from selenium import webdriver
            from selenium.webdriver.common.by import By
            from selenium_extensions.core import populate_text_field

            driver = webdriver.Chrome()
            ...
            populate_text_field(driver, (By.CLASS_NAME, 'textbox'), 'some text')
    '''
    driver.find_element(*element_locator).send_keys(text)
def __init__(self, username, passwd, playground, groupname):
    """Start Chrome behind the proxy, sign in to rankade and open a group.

    Args:
        username: value typed into the "email" field.
        passwd: value typed into the "password" field.
        playground: stored unchanged on ``self.playground``.
        groupname: link text of the group opened from the dashboard.

    Raises:
        AssertionError: when a page title lacks the expected text.

    If any step after the browser is started fails, the browser is quit
    before the error propagates, so a failed construction does not leave a
    Chrome process behind.
    """
    self.playground = playground
    chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument('--proxy-server=http://xx.mioffice.cn:8888')
    self.driver = webdriver.Chrome(chrome_options = chrome_options)
    driver = self.driver
    try:
        driver.implicitly_wait(300) # seconds
        driver.get('https://rankade.com/')
        assert 'rankade' in driver.title
        driver.find_element_by_css_selector("a.sign-button.sign-in-button").click()
        assert 'Sign in' in driver.title
        # Local names avoid shadowing the builtin ``input``.
        email_field = driver.find_element_by_name("email")
        email_field.send_keys(username)
        password_field = driver.find_element_by_name("password")
        password_field.send_keys(passwd)
        driver.find_element_by_name("submit").click()
        assert 'rankade' in driver.title
        driver.find_element_by_id("dashboardLink").click()
        # assert 'rankade - My dashboard' in driver.title
        driver.find_element_by_link_text(groupname).click()
    except BaseException:
        # The half-built object never reaches the caller; release the browser.
        driver.quit()
        raise
def chrome(self):
    """Create and return a new ``webdriver.Chrome`` with default arguments."""
    # Background reading collected by the original author:
    # https://github.com/SeleniumHQ/selenium/blob/master/py/selenium/webdriver/remote/webdriver.py
    # http://www.guguncube.com/2983/python-testing-selenium-with-google-chrome
    # https://gist.github.com/addyosmani/5336747
    # http://blog.likewise.org/2015/01/setting-up-chromedriver-and-the-selenium-webdriver-python-bindings-on-ubuntu-14-dot-04/
    # https://sites.google.com/a/chromium.org/chromedriver/getting-started
    # http://stackoverflow.com/questions/8255929/running-webdriver-chrome-with-selenium
    return webdriver.Chrome()
# @property
# def firefox(self):
# profile = webdriver.FirefoxProfile()
# #firefox = webdriver.Firefox(firefox_profile=profile)
# firefox = WebDriver(firefox_profile=profile)
# return firefox
def loadCartAndCheckout(self):
    """Open the cart page in Chrome carrying the cookies of ``self.user_session``.

    The cart URL is loaded once before the cookies are added, every cookie
    from the requests session is copied into the browser with path ``/``,
    and the cart URL is loaded again. The browser is not closed here (the
    quit call is commented out in the original).
    """
    # Import cookies
    driver = webdriver.Chrome(executable_path="./chromedriver")
    driver.delete_all_cookies()
    driver.get(self.URL_cart)

    session_cookies = requests.utils.dict_from_cookiejar(self.user_session.cookies)
    for cookie_name, cookie_value in session_cookies.items():
        driver.add_cookie({'name': cookie_name,
                           'value': cookie_value,
                           'path': '/'})

    driver.get(self.URL_cart)
    #time.sleep(5)
    #driver.quit()
def get_js_errors(self):
    '''
    Collects JS errors through the JSErrorCollector plugin for Chrome / Firefox.

    When ``self.driver`` is 'Chrome' or 'Firefox' the result of the
    collector script run via ``self.js`` is returned, e.g.:

    [
        {
            'sourceName': u'tests/html/js_error.html',
            'pageUrl': u'tests/html/js_error.html',
            'errorMessage': 'ReferenceError: b is not defined',
            'lineNumber': 7
        }
    ]

    For any other driver a notice is printed and an empty list is returned.
    '''
    if self.driver not in ('Chrome', 'Firefox'):
        print("Checking for JS errors with this method only works in Firefox or Chrome")
        return []

    collector_script = 'return window.JSErrorCollector_errors ? window.JSErrorCollector_errors.pump() : []'
    return self.js(collector_script)
def screenshot(self, path=None):
    '''
    Saves a screenshot of the current browser to ``path``.

    Parameters
    ----------
    path: str
        Destination file. Defaults to: /tmp/selenium-screenshot.png

    When ``self.browser`` equals 'chrome-headless' only a notice is printed
    and nothing is saved.
    '''
    target = path or '/tmp/selenium-screenshot.png'

    # NOTE: the original carried a commented-out branch that fetched a base64
    # screenshot for remote webdrivers; it was disabled and is not reproduced.
    if self.browser != 'chrome-headless':
        self.browser.save_screenshot(target)
        return
    print("You are running Chrome in headless mode. Screenshots will be blank.")
def gethtml(zurl,str_fname):
    """Load ``zurl`` in mobile-emulated Chrome and save the page source.

    Args:
        zurl: URL to open.
        str_fname: file name prefix; the output file is
            ``<str_fname>-<YYYYMMDD>.html`` (local date), written as UTF-8.

    The page is scrolled once through ``myscroll`` before it is saved. The
    output file is closed deterministically and the browser is always quit,
    even when a step fails.
    """
    mobileEmulation = {'deviceName': 'Apple iPhone 6'}
    options = webdriver.ChromeOptions()
    options.add_experimental_option('mobileEmulation', mobileEmulation)
    driver = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=options)
    try:
        driver.get(zurl)
        time.sleep(5)
        # A larger range (e.g. 300) scrolls further; one pass is used here.
        for i in range(0, 1):
            print('????' + str(i))
            myscroll(driver)
            time.sleep(2)
        st = time.strftime("%Y%m%d", time.localtime())
        # Read the source before opening the file so a failed read does not
        # leave an empty output file behind.
        page_source = driver.page_source
        with open(str_fname + "-" + st + ".html", 'w', encoding='utf-8') as out_file:
            print(page_source, file=out_file)
        print("?????????")
        print(driver.title)
    finally:
        driver.quit()
def _get_webdriver(self):
    """Build the webdriver matching ``self.browser_type``.

    Chrome is quite fast, but not as stealthy as PhantomJS.

    Returns:
        The result of ``_get_Chrome``, ``_get_Firefox`` or ``_get_PhantomJS``
        for the browser types 'chrome', 'firefox' and 'phantomjs'
        respectively; ``False`` for any other browser type.
    """
    factories = (
        ('chrome', '_get_Chrome'),
        ('firefox', '_get_Firefox'),
        ('phantomjs', '_get_PhantomJS'),
    )
    for browser_type, factory_name in factories:
        if self.browser_type == browser_type:
            return getattr(self, factory_name)()
    return False
def setup():
    """Start Chrome with image loading disabled and open the Instagram login page.

    The driver binary is expected at ``<current working directory>/chromedriver``.
    The created driver is stored in the module-level ``browser`` global.
    """
    global browser
    login_url = r'https://www.instagram.com/accounts/login/'
    driver_path = os.getcwd()+'/chromedriver'

    chrome_options = webdriver.ChromeOptions()
    #chrome_options.binary_location='/opt/google/chrome/google-chrome'
    # These arguments make chrome run headless. Unfortunately the chrome
    # headless is in beta and hence considerably slow.
    # chrome_options.add_argument("--headless")
    # chrome_options.add_argument("--disable-gpu")
    # chrome_options.add_argument("--start-fullscreen")
    chrome_options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images":2})

    print('reached 1')
    browser = webdriver.Chrome(driver_path,chrome_options=chrome_options)
    print('reached')
    #browser.set_window_position(-10000000, 0) #move chrome away from view
    print('Fetching login page..')
    browser.get(login_url)
    print('reached login page')
def getHtml(url, loadmore = False, waittime = 2):
    """Return the page source of ``url`` as rendered by Chrome.

    Args:
        url: page to open.
        loadmore: when true, keep clicking the element with class "more"
            until it can no longer be found or clicked.
        waittime: seconds to sleep after the initial load and after each click.

    Returns:
        The browser's page source.

    The browser is always quit, even if loading the page fails.
    """
    browser = webdriver.Chrome('chromedriver')
    try:
        browser.get(url)
        time.sleep(waittime)
        if loadmore:
            while True:
                try:
                    next_button = browser.find_element_by_class_name("more")
                    next_button.click()
                    time.sleep(waittime)
                except Exception:
                    # Any lookup/click failure ends the load-more loop;
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    break
        return browser.page_source
    finally:
        browser.quit()
# for test
#url = "https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=??,??,??"
#html = getHtml(url)
#print(html)
scrape_ratings_threaded.py 文件源码
项目:glassdoor-analysis
作者: THEdavehogue
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def glassdoor_login():
    '''
    Create a selenium Chrome driver and submit the Glassdoor login form
    with the module-level USER_ID / PASSWORD credentials.

    INPUT:
        None
    OUTPUT:
        webdriver.Chrome object
    '''
    login_url = 'https://www.glassdoor.com/profile/login_input.htm'
    driver = webdriver.Chrome()
    # The return value is not used; presumably this call loads the login
    # page into the driver -- verify against get_soup.
    get_soup(driver, login_url)

    username_field = driver.find_element_by_name('username')
    username_field.click()
    username_field.send_keys(USER_ID)

    password_field = driver.find_element_by_xpath('//*[@id="signInPassword"]')
    password_field.click()
    password_field.send_keys(PASSWORD)

    driver.find_element_by_id('signInBtn').click()
    return driver
# Opens the SJTU electsys login page in a browser and rewrites the captcha-like
# image in place through a canvas.
# NOTE(review): indentation was lost in this copy, so it is not certain whether
# the navigation below runs only when a new browser is created or always --
# confirm against the original project. `n`, `user_name` and `password` are not
# used in the visible lines; the function may be truncated here.
def give_me_the_page(n, user_name, password, broswer, pt = None):
# No browser supplied: pick one by name ('Chrome', 'Safari', anything else -> PhantomJS).
if not pt:
if broswer=='Chrome':
pt = webdriver.Chrome()
elif broswer=='Safari':
pt = webdriver.Safari()
else:
pt = webdriver.PhantomJS()
pt.get('http://electsys.sjtu.edu.cn/edu/login.aspx')
time.sleep(1)
# The script draws the first <img> of the third <div> under #form-input onto a
# canvas of the same size and replaces the image source with the canvas data URL.
pt.execute_script("""var img=document.getElementById('form-input').getElementsByTagName('div')[2].getElementsByTagName('img')[0];
var d=document.createElement('CANVAS');
var cxt=d.getContext('2d');
d.width=img.width;
d.height=img.height;
cxt.drawImage(img,0,0);
img.src=d.toDataURL('png');""")
def get_driver_path():
    """Return the normalized path of the bundled chromedriver for this platform.

    The driver is looked up three directories above this file, under
    ``resources/chrome/<platform folder>/chromedriver`` where the folder is
    ``linux_x64`` for Linux and ``mac_x64`` for OS X.

    Raises:
        ValueError: if ``platform`` is neither Linux nor OS X.
        AssertionError: if the resolved path is not an existing file.
    """
    if platform in ("linux", "linux2"):
        # linux
        platform_folder = "linux_x64"
    elif platform == "darwin":
        # OS X
        platform_folder = "mac_x64"
    else:
        raise ValueError("Platform not identified")

    this_dir = os.path.dirname(os.path.abspath(__file__))
    candidate = os.path.join(this_dir,
                             os.pardir, os.pardir, os.pardir,
                             "resources", "chrome", platform_folder,
                             "chromedriver")
    chrome_driver_path = os.path.normpath(candidate)
    assert os.path.isfile(chrome_driver_path), \
        "Chrome driver must exists: %s" % chrome_driver_path
    return chrome_driver_path
def strat_isml(thread):
    """Start Chrome with a random user agent and a per-thread profile directory.

    Args:
        thread: value interpolated into ``--user-data-dir=c:\\Users\\<thread>``.

    Returns:
        tuple: ``(driver, uaList)`` where ``uaList`` is every line of
        ``Base_Data\\ualist.txt`` without its line terminator.

    Extra Chrome arguments are read one per line from
    ``Base_Data\\ChromeOptions.txt``; blank lines are skipped and surrounding
    whitespace (including the newline) is removed before they are passed on.
    """
    # A context manager closes the file; rstrip keeps the final character of
    # a last line that has no trailing newline (``line[:-1]`` dropped it).
    with open('Base_Data\\ualist.txt') as ua_file:
        uaList = [line.rstrip('\n') for line in ua_file]
    i = random.choice(uaList)
    option = webdriver.ChromeOptions()
    option.add_argument('--user-agent={}'.format(i))
    option.add_argument('--profile-directory=Default')
    option.add_argument('--user-data-dir=c:\\Users\\{}'.format(thread))
    with open("Base_Data\\ChromeOptions.txt") as a:
        for line in a:
            argument = line.strip()
            if argument:
                option.add_argument(argument)
    path1 = 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe'
    path2 = 'C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe'
    try:
        dr = webdriver.Chrome(path1,chrome_options=option)
    except Exception:
        # Fall back to the 64-bit install location.
        dr = webdriver.Chrome(path2,chrome_options=option)
    return dr,uaList
def fb_login(self):
    """Log in to Facebook in Chrome and keep the session open until Enter.

    Credentials come from ``get_details()``. The chromedriver binary is tried
    at ``/usr/bin/chromedriver`` first and at
    ``/usr/lib/chromium-browser/chromedriver`` if that fails. The browser is
    always quit, even when a login step fails.
    """
    usr, pwd = get_details()
    try:
        driver = webdriver.Chrome('/usr/bin/chromedriver')
    except Exception:
        driver = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver')
    try:
        driver.get('https://www.facebook.com/')
        user_id = driver.find_element_by_id('email')
        user_id.send_keys(usr)
        sleep(2)
        password = driver.find_element_by_id('pass')
        password.send_keys(pwd)
        sleep(2)
        submit = driver.find_element_by_id('loginbutton')
        submit.click()
        # Block until the user confirms, on both Python 2 and 3.
        if six.PY2:
            raw_input('Enter anything to end the session: ')
        else:
            input('Enter anything to end the session: ')
    finally:
        driver.quit()
def netease():
    """Open http://30daydo.com/ in Chrome with a desktop user agent and click an element.

    After a 5-second pause the page title is asserted to contain "baidu".
    The browser is always quit, including when a lookup or the assertion fails.

    Raises:
        AssertionError: if "baidu" is not in the page title.
    """
    options = webdriver.ChromeOptions()
    options.add_argument(
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36')
    driver = webdriver.Chrome(executable_path=r'C:\software\chrome\chromedriver.exe',
                              chrome_options=options)
    try:
        driver.implicitly_wait(40)
        driver.get("http://30daydo.com/")
        # NOTE(review): the tag name arrived garbled ("??") in this copy -- confirm the intended value.
        elem_user = driver.find_element_by_tag_name("??")
        elem_user.click()
        # Disabled in the original:
        # elem_pwd = driver.find_element_by_name("password")
        # elem_pwd.send_keys("123456")
        # elem_pwd.send_keys(Keys.RETURN)
        time.sleep(5)
        assert "baidu" in driver.title
    finally:
        # quit() closes every window and ends the driver session.
        driver.quit()
def key_operation():
    """Page down 190 times on the fang.com price list, then wait for Enter.

    Chrome is started with a desktop user agent, PAGE_DOWN is sent to the
    ``<body class='whitebg'>`` element 190 times with a 5-second pause after
    each key press, and the function then blocks on a console prompt. The
    browser is not closed here.
    """
    options = webdriver.ChromeOptions()
    options.add_argument(
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36')
    browser = webdriver.Chrome(executable_path=r'C:\software\chrome\chromedriver.exe',
                               chrome_options=options)
    browser.implicitly_wait(60)
    browser.get('https://m.fang.com/fangjia/sz_list_pinggu/')
    #browser.send_keys(Keys.DOWN)
    for _ in range(190):
        browser.find_element_by_xpath("//body[@class='whitebg']").send_keys(Keys.PAGE_DOWN)
        time.sleep(5)
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter('enter')
def __init__(self, username, passwd):
    """Start Chrome, sign in to rankade and open the "mifoosball" group.

    Args:
        username: value typed into the "email" field.
        passwd: value typed into the "password" field.

    Raises:
        AssertionError: when a page title lacks the expected text.

    If any step after the browser is started fails, the browser is quit
    before the error propagates, so a failed construction does not leave a
    Chrome process behind.
    """
    self.driver = webdriver.Chrome()
    driver = self.driver
    try:
        driver.implicitly_wait(300) # seconds
        driver.get('https://rankade.com/')
        assert 'rankade' in driver.title
        driver.find_element_by_css_selector("a.sign-button.sign-in-button").click()
        assert 'Sign in' in driver.title
        # Local names avoid shadowing the builtin ``input``.
        email_field = driver.find_element_by_name("email")
        email_field.send_keys(username)
        password_field = driver.find_element_by_name("password")
        password_field.send_keys(passwd)
        driver.find_element_by_name("submit").click()
        assert 'rankade' in driver.title
        driver.find_element_by_id("dashboardLink").click()
        # assert 'rankade - My dashboard' in driver.title
        driver.find_element_by_link_text("mifoosball").click()
        # driver.save_screenshot('mifoosball.png')
        # assert 'mifoosball' in driver.title
    except BaseException:
        # The half-built object never reaches the caller; release the browser.
        driver.quit()
        raise
def takeSnapshot(report_name, root_path, script_name=None, *args, **kwargs):
    """Render a report in headless Chrome and save a PNG screenshot of it.

    Args:
        report_name: report to render; names starting with ``_`` are looked
            up under ``config.ARES_FOLDER/reports``, all others under
            ``config.ARES_USERS_LOCATION``.
        root_path: project root containing ``system/webDrivers/chromedriver``
            and the report folders.
        script_name: script to run; defaults to ``report_name``.
        *args: accepted but not used.
        **kwargs: forwarded to ``url_for`` for the ``ares.run_report`` URL.

    The screenshot is written to ``<report dir>/<script_name>.png``. The
    driver is always quit, even when building the URL or loading it fails.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    driver = webdriver.Chrome(os.path.join(root_path, 'system', 'webDrivers', 'chromedriver'), chrome_options=options)
    try:
        if not script_name:
            script_name = report_name
        url_str = url_for('ares.run_report', report_name=report_name, script_name=script_name, **kwargs)
        if report_name.startswith('_'):
            report_dir = os.path.join(root_path, config.ARES_FOLDER, 'reports', report_name)
            if report_name == '_AresTemplates':
                # Templates are served by a dedicated endpoint.
                url_str = url_for('ares.run_template', template=script_name)
        else:
            report_dir = os.path.join(root_path, config.ARES_USERS_LOCATION, report_name)
        driver.get(url_str)
        driver.save_screenshot(os.path.join(report_dir, '%s.png' % script_name ))
    finally:
        driver.quit()
def scrape(screen_name, since_date, until_date, include_retweets=True, wait_secs=5):
    """Collect tweet ids from a Twitter search page by scrolling until no new tweets load.

    Args:
        screen_name: account whose tweets are searched (``from:`` filter).
        since_date: object with ``isoformat()``; lower bound of the search.
        until_date: object with ``isoformat()``; upper bound of the search.
        include_retweets: when true, ``+include:retweets`` is appended to the URL.
        wait_secs: implicit wait and the pause after every scroll, in seconds.

    Returns:
        set: the ``data-tweet-id`` attribute of every "original-tweet" element.
    """
    log.info("Scraping %s since %s until %s", screen_name, since_date, until_date)
    driver = webdriver.Chrome()

    def find_tweets():
        return driver.find_elements_by_class_name("original-tweet")

    try:
        driver.implicitly_wait(wait_secs)
        url = "https://twitter.com/search?f=tweets&vertical=default&q=from:{}+since:{}+until:{}&src=typd".format(
            screen_name, since_date.isoformat(), until_date.isoformat())
        if include_retweets:
            # NOTE(review): this lands after "&src=typd", not inside the q
            # parameter -- confirm that is intended.
            url += "+include:retweets"
        log.debug("Getting %s", url)
        driver.get(url)

        scroll_count = 0
        last_tweet_count = 0
        # Keep scrolling while the number of loaded tweets still changes.
        while len(find_tweets()) != last_tweet_count:
            scroll_count += 1
            last_tweet_count = len(find_tweets())
            log.debug("Scrolling down %s. Found %s tweets.", scroll_count, last_tweet_count)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(wait_secs)

        return {tweet.get_attribute("data-tweet-id") for tweet in find_tweets()}
    finally:
        driver.close()
        driver.quit()
def create_selenium_driver(browser='chrome'):
    """Create a selenium webdriver for the requested browser.

    Args:
        browser: one of 'chrome', 'firefox', 'marionette', 'ie' or
            'phantomjs'. A non-empty ``TOASTER_TESTS_BROWSER`` environment
            variable overrides this argument.

    Returns:
        The webdriver instance for the selected browser.

    Raises:
        RuntimeError: if the selected browser is not one of the names above.
    """
    # set default browser string based on env (if available)
    env_browser = os.environ.get('TOASTER_TESTS_BROWSER')
    if env_browser:
        browser = env_browser

    if browser == 'chrome':
        return webdriver.Chrome(
            service_args=["--verbose", "--log-path=selenium.log"]
        )
    elif browser == 'firefox':
        return webdriver.Firefox()
    elif browser == 'marionette':
        # Copy first: DesiredCapabilities.FIREFOX is a shared class-level
        # dict, and mutating it would leak the flag into every later user.
        capabilities = DesiredCapabilities.FIREFOX.copy()
        capabilities['marionette'] = True
        return webdriver.Firefox(capabilities=capabilities)
    elif browser == 'ie':
        return webdriver.Ie()
    elif browser == 'phantomjs':
        return webdriver.PhantomJS()
    else:
        msg = 'Selenium driver for browser %s is not available' % browser
        raise RuntimeError(msg)
def __scrollbars_hide(self):
    """Hides Chrome's scrollbars.

    Injects a <style> element with id ``chrome_screenshot_fix`` into the
    document body; its only rule is `::-webkit-scrollbar {width: 0px;}`.

    Args:
        None

    Returns:
        None
    """
    statements = (
        "var sheet = document.createElement('style');",
        "sheet.id = 'chrome_screenshot_fix';",
        "sheet.innerHTML = '::-webkit-scrollbar {width: 0px;}';",
        "document.body.appendChild(sheet);",
    )
    # Joined with single spaces plus a trailing space, the script text is
    # identical to the original concatenated literal.
    self.execute_script(" ".join(statements) + " ")
def __screenshot_png(self, func):
    """Helper function that produces the screenshot.

    Produces a stitched together screenshot of the current webpage.
    Automatically hides Chrome's scrollbars and restores them afterwards,
    also when capturing or ``func`` raises.

    Args:
        func: A helper function which will be passed the finalized
            screenshot. Whatever is returned by `func` is returned
            by this function.

    Returns:
        Whatever is returned by func(screenshot).
    """
    self.__scrollbars_hide()
    try:
        doc_width = self.__document_width
        doc_height = self.__document_height
        # Canvas and tile offsets are doubled relative to the document size
        # (presumably for a 2x device pixel ratio -- TODO confirm).
        with Image(width=doc_width*2, height=doc_height*2) as screenshot:
            for data, rect in self.__iter_screenshots((doc_width, doc_height)):
                with Image(blob=base64.b64decode(data),
                           format='png') as shot:
                    screenshot.composite(image=shot,
                                         left=rect[LEFT]*2,
                                         top=rect[TOP]*2)
                # Drop the encoded tile as soon as it has been composited.
                del data
            # func must run while the stitched image is still open.
            return func(screenshot)
    finally:
        self.__scrollbars_restore()
def login(url):
    """Log in through the form at ``url`` in Chrome and return the cookies.

    The account name and password are read from the console. The page's
    ``switcher_plogin`` element is clicked first, then the ``u`` / ``p``
    fields are filled and ``login_button`` is clicked.

    Args:
        url: login page to open.

    Returns:
        The cookies reported by the driver 5 seconds after submitting.

    The browser is always quit, even if a step fails.
    """
    login_name = input('???QQ?\n')
    login_password = input('???QQ??\n')
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(3)
        login_type = driver.find_element_by_id('switcher_plogin')
        login_type.click()
        username = driver.find_element_by_id('u')
        username.clear()
        password = driver.find_element_by_id('p')
        password.clear()
        username.send_keys(login_name)
        password.send_keys(login_password)
        submit = driver.find_element_by_id('login_button')
        submit.click()
        time.sleep(5)
        return driver.get_cookies()
    finally:
        # quit() also ends the chromedriver session, which close() did not.
        driver.quit()
def login():
    """Log in to mail.163.com in Chrome and return the first cookie.

    The account and password are read from the console.

    Returns:
        The first entry of the driver's cookie list after login.

    Raises:
        IndexError: if the driver reports no cookies.

    The browser is always quit; previously it was never closed.
    """
    acount_num = input('?????:\n')
    passwd_str = input('?????:\n')
    driver = webdriver.Chrome(executable_path='/Users/resolvewang/Documents/program/driver/chromedriver')
    try:
        url = 'http://mail.163.com/'
        driver.get(url)
        time.sleep(5)
        # The login fields are looked up after switching into this iframe.
        driver.switch_to.frame('x-URS-iframe')
        acount = driver.find_element_by_name('email')
        acount.clear()
        acount.send_keys(acount_num)
        passwd = driver.find_element_by_name('password')
        passwd.clear()
        passwd.send_keys(passwd_str)
        time.sleep(3)
        click_button = driver.find_element_by_id('dologin')
        click_button.click()
        time.sleep(5)
        cur_cookies = driver.get_cookies()[0]
        return cur_cookies
    finally:
        driver.quit()
def login(login_url, login_name, login_passwd):
    """Submit the account login form at ``login_url`` and return the cookies.

    Args:
        login_url: login page to open.
        login_name: value typed into the ``loginname`` field.
        login_passwd: value typed into the ``nloginpwd`` field.

    Returns:
        The cookies reported by the driver 5 seconds after submitting.

    The browser is always quit, even if a step fails.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(login_url)
        time.sleep(5)
        # Click the element with class "login-tab-r" before filling the form.
        login_tab_right = driver.find_element_by_class_name('login-tab-r')
        login_tab_right.click()
        account = driver.find_element_by_id('loginname')
        password = driver.find_element_by_id('nloginpwd')
        submit = driver.find_element_by_id('loginsubmit')
        account.clear()
        password.clear()
        account.send_keys(login_name)
        password.send_keys(login_passwd)
        submit.click()
        time.sleep(5)
        jd_cookies = driver.get_cookies()
        return jd_cookies
    finally:
        # quit() also ends the chromedriver session, which close() did not.
        driver.quit()
def login(name, passwd):
    """Log in to pan.baidu.com in Chrome and return the session cookies.

    Args:
        name: value typed into the user name field.
        passwd: value typed into the password field.

    Returns:
        The cookies reported by the driver 20 seconds after submitting.

    The browser is always quit; previously it was never closed.
    """
    url = 'https://pan.baidu.com/'
    # NOTE(review): the original comment here (garbled in this copy) mentioned
    # Chrome and PhantomJS -- confirm against the source project.
    driver = webdriver.Chrome(executable_path='/Users/resolvewang/Documents/program/driver/chromedriver')
    try:
        driver.maximize_window()
        driver.get(url)
        print('????')
        # Click the "account-title" element inside the "pass-login-tab" container.
        chg_field = driver.find_element_by_class_name('pass-login-tab').find_element_by_class_name('account-title')
        chg_field.click()
        name_field = driver.find_element_by_id('TANGRAM__PSP_4__userName')
        name_field.send_keys(name)
        passwd_field = driver.find_element_by_id('TANGRAM__PSP_4__password')
        passwd_field.send_keys(passwd)
        login_button = driver.find_element_by_id('TANGRAM__PSP_4__submit')
        login_button.click()
        time.sleep(20)
        return driver.get_cookies()
    finally:
        driver.quit()
def load_driver(config, vdisplay=None):
    """Initialize the webdriver selected in config with the given config.

    Args:
        config (dict): The configuration loaded previously in Cabu.
        vdisplay: Passed to ``DriverException`` when the driver name is
            not recognized.

    Returns:
        webdriver (selenium.webdriver): An instance of selenium webdriver,
        or None when ``DRIVER_NAME`` is missing or empty.

    Raises:
        DriverException: if ``DRIVER_NAME`` is set but not one of
            'Firefox', 'Chrome' or 'PhantomJS'.
    """
    # Read the name once with .get so a missing key returns None as
    # documented instead of raising KeyError on the first comparison.
    driver_name = config.get('DRIVER_NAME')
    if not driver_name:
        return None

    if driver_name == 'Firefox':
        driver = load_firefox(config)
    elif driver_name == 'Chrome':
        driver = load_chrome(config)
    elif driver_name == 'PhantomJS':
        driver = load_phantomjs(config)
    else:
        raise DriverException(vdisplay, 'Driver unrecognized.')

    driver.set_page_load_timeout(config['DRIVER_PAGE_TIMEOUT'])
    driver.set_window_size(config['DRIVER_WINDOWS_WIDTH'], config['DRIVER_WINDOWS_HEIGHT'])
    return driver
def init_driver(self):
    """Create the module-level ``driver`` once, based on ``self.driver_name``.

    Does nothing when ``self.is_initialized`` is already true. Supported
    names are 'chrome', 'phantomjs' and 'firefox'; the driver is created
    with ``executable_path=self.driver_path``, resized to
    ``self.width`` x ``self.height`` and given a 5-second implicit wait.

    Raises:
        Exception: if ``self.driver_name`` is not a supported name.
    """
    global driver
    if self.is_initialized:
        return

    supported = (
        ('chrome', 'Chrome'),
        ('phantomjs', 'PhantomJS'),
        ('firefox', 'Firefox'),
    )
    for name, class_name in supported:
        if self.driver_name == name:
            # Resolve the class lazily so only the selected driver is touched.
            driver = getattr(webdriver, class_name)(executable_path=self.driver_path)
            break
    else:
        raise Exception(
            'Driver "{}" is not supported'.format(self.driver_name))

    self.is_initialized = True
    driver.set_window_size(self.width, self.height)
    driver.implicitly_wait(5)