requests 模块 介绍 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 >>> import requests>>> r = requests.get('https://api.github.com/events' )>>> r = requests.post('http://httpbin.org/post' , data = {'key' :'value' })>>> r = requests.put('http://httpbin.org/put' , data = {'key' :'value' })>>> r = requests.delete('http://httpbin.org/delete' )>>> r = requests.head('http://httpbin.org/get' )>>> r = requests.options('http://httpbin.org/get' ) HTTP协议参考:https://www.cnblogs.com/ranyonsue/p/5984001.html
基于GET请求 基本请求 1 2 3 import requestsresponse=requests.get('http://dig.chouti.com' ) print (response)
带参数的GET请求 -> params 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 import requestsresponse=requests.get('https://www.baidu.com/s?wd=python&pn=1' , headers={ 'User-Agent' :'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36' , }) print (response.text)from urllib.parse import urlencodewd='是大王啊' encode_res=urlencode({'k' :wd},encoding='utf-8' ) keyword=encode_res.split('=' )[1 ] print (keyword)url='https://www.baidu.com/s?wd=%s&pn=1' %keyword response=requests.get(url, headers={ 'User-Agent' :'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36' , }) res1=response.text from urllib.parse import urlencodewd='是大王 啊' pn=1 response=requests.get('https://www.baidu.com/s' , params={ 'wd' :wd, 'pn' :pn }, headers={ 'User-Agent' :'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36' , }) res2=response.text with open ('a.html' ,'w' ,encoding='utf-8' ) as f: f.write(res1) with open ('b.html' , 'w' , encoding='utf-8' ) as f: f.write(res2)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 Host Referer User-Agent Cookie import requestsresponse=requests.get('https://www.zhihu.com/explore' ) print (response.status_code) headers={ 'User-Agent' : 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36' , } respone=requests.get('https://www.zhihu.com/explore' , headers=headers) print (respone.status_code)
带参数的GET请求->cookies 1 2 3 4 5 6 7 8 9 import requestsCookies={ 'user_session' :'wGMHFJKgDcmRIVvcA14_Wrt_3xaUyJNsBnPbYzEL6L0bHcfc' , } response=requests.get('https://github.com/settings/emails' , cookies=Cookies) print ('378533872@qq.com' in response.text)
基于POST请求 介绍 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 HTTP默认的请求方法就是GET * 没有请求体 * 数据必须在1K之内! * GET请求数据会暴露在浏览器的地址栏中 GET请求常用的操作: 1. 在浏览器的地址栏中直接给出URL,那么就一定是GET请求 2. 点击页面上的超链接也一定是GET请求 3. 提交表单时,表单默认使用GET请求,但可以设置为POST POST请求的特点: (1). 数据不会出现在地址栏中 (2). 数据的大小没有上限 (3). 有请求体 (4). 请求体中如果存在中文,会使用URL编码!
发送post请求,模拟浏览器的登录行为 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 ''' 一 目标站点分析 浏览器输入https://github.com/login 然后输入错误的账号密码,抓包 发现登录行为是post提交到:https://github.com/session 而且请求头包含cookie 而且请求体包含: commit:Sign in utf8:✓ authenticity_token:lbI8IJCwGslZS8qJPnof5e7ZkCoSoMn6jmDTsL1r/m06NLyIbw7vCrpwrFAPzHMep3Tmf/TSJVoXWrvDZaVwxQ== login:egonlin password:123 二 流程分析 先GET:https://github.com/login拿到初始cookie与authenticity_token 返回POST:https://github.com/session, 带上初始cookie,带上请求体(authenticity_token,用户名,密码等) 最后拿到登录cookie ps:如果密码时密文形式,则可以先输错账号,输对密码,然后到浏览器中拿到加密后的密码,github的密码是明文 ''' import requestsimport rer1=requests.get('https://github.com/login' ) r1_cookie=r1.cookies.get_dict() authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"' ,r1.text)[0 ] data={ 'commit' :'Sign in' , 'utf8' :'✓' , 'authenticity_token' :authenticity_token, 'login' :'317828332@qq.com' , 'password' :'alex3714' } r2=requests.post('https://github.com/session' , data=data, cookies=r1_cookie ) login_cookie=r2.cookies.get_dict() r3=requests.get('https://github.com/settings/emails' , cookies=login_cookie) print ('317828332@qq.com' in r3.text)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 import requestsimport resession=requests.session() r1=session.get('https://github.com/login' ) authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"' ,r1.text)[0 ] data={ 'commit' :'Sign in' , 'utf8' :'✓' , 'authenticity_token' :authenticity_token, 'login' :'317828332@qq.com' , 'password' :'alex3714' } r2=session.post('https://github.com/session' , data=data, ) r3=session.get('https://github.com/settings/emails' ) print ('317828332@qq.com' in r3.text)
补充 1 2 3 4 5 6 7 8 9 10 11 12 13 14 requests.post(url='xxxxxxxx' , data={'xxx' :'yyy' }) requests.post(url='' , data={'' :1 ,}, headers={ 'content-type' :'application/json' }) requests.post(url='' , json={'' :1 ,}, )
响应Response response属性 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 import requestsrespone=requests.get('http://www.jianshu.com' ) print (respone.text)print (respone.content)print (respone.status_code)print (respone.headers)print (respone.cookies)print (respone.cookies.get_dict())print (respone.cookies.items())print (respone.url)print (respone.history)print (respone.encoding)from contextlib import closingwith closing(requests.get('xxx' ,stream=True )) as response: for line in response.iter_content(): pass
编码问题 1 2 3 4 5 import requestsresponse=requests.get('http://www.autohome.com/news' ) print (response.text)
获取二进制数据 1 2 3 4 5 6 import requestsresponse=requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1509868306530&di=712e4ef3ab258b36e9f4b48e85a81c9d&imgtype=0&src=http%3A%2F%2Fc.hiphotos.baidu.com%2Fimage%2Fpic%2Fitem%2F11385343fbf2b211e1fb58a1c08065380dd78e0c.jpg' ) with open ('a.jpg' ,'wb' ) as f: f.write(response.content)
1 2 3 4 5 6 7 8 9 10 import requestsresponse=requests.get('https://gss3.baidu.com/6LZ0ej3k1Qd3ote6lo7D0j9wehsv/tieba-smallvideo-transcode/1767502_56ec685f9c7ec542eeaf6eac93a65dc7_6fe25cd1347c_3.mp4' , stream=True ) with open ('b.mp4' ,'wb' ) as f: for line in response.iter_content(): f.write(line)
解析json 1 2 3 4 5 6 7 8 9 10 11 import requestsresponse=requests.get('http://httpbin.org/get' ) import jsonres1=json.loads(response.text) res2=response.json() print (res1 == res2)
Redirection and History 先看官网的解释
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 默认情况下,除HEAD之外,Requests会对所有HTTP请求方法自动处理重定向(location redirection)。 我们可以使用Response对象的history属性来追踪重定向。 Response.history是一个列表,包含为完成本次请求而创建的所有Response对象。列表按从最早到最近的响应顺序排序。 例如,GitHub会将所有HTTP请求重定向到HTTPS: >>> r = requests.get('http://github.com') >>> r.url 'https://github.com/' >>> r.status_code 200 >>> r.history [<Response [301]>] 如果使用GET、OPTIONS、POST、PUT、PATCH或DELETE,可以通过allow_redirects参数禁用重定向处理: >>> r = requests.get('http://github.com', allow_redirects=False) >>> r.status_code 301 >>> r.history [] 如果使用HEAD,也可以通过该参数启用重定向: >>> r = requests.head('http://github.com', allow_redirects=True) >>> r.url 'https://github.com/' >>> r.history [<Response [301]>]
利用github登录后跳转到主页面的例子来验证它
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 import requestsimport rer1=requests.get('https://github.com/login' ) r1_cookie=r1.cookies.get_dict() authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"' ,r1.text)[0 ] data={ 'commit' :'Sign in' , 'utf8' :'✓' , 'authenticity_token' :authenticity_token, 'login' :'317828332@qq.com' , 'password' :'alex3714' } r2=requests.post('https://github.com/session' , data=data, cookies=r1_cookie ) print (r2.status_code) print (r2.url) print (r2.history) print (r2.history[0 ].text) r2=requests.post('https://github.com/session' , data=data, cookies=r1_cookie, allow_redirects=False ) print (r2.status_code) print (r2.url) print (r2.history)
高级用法 SSL Cert Verification 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 import requestsrespone=requests.get('https://www.12306.cn' ) import requestsrespone=requests.get('https://www.12306.cn' ,verify=False ) print (respone.status_code)import requestsfrom requests.packages import urllib3urllib3.disable_warnings() respone=requests.get('https://www.12306.cn' ,verify=False ) print (respone.status_code)import requestsrespone=requests.get('https://www.12306.cn' , cert=('/path/server.crt' , '/path/key' )) print (respone.status_code)
使用代理 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 import requestsproxies={ 'http' :'http://egon:123@localhost:9743' , 'http' :'http://localhost:9743' , 'https' :'https://localhost:9743' , } respone=requests.get('https://www.12306.cn' , proxies=proxies) print (respone.status_code)import requestsproxies = { 'http' : 'socks5://user:pass@host:port' , 'https' : 'socks5://user:pass@host:port' } respone=requests.get('https://www.12306.cn' , proxies=proxies) print (respone.status_code)
超时设置 1 2 3 4 5 6 7 8 import requestsrespone=requests.get('https://www.baidu.com' , timeout=0.0001 )
认证设置 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 import requestsfrom requests.auth import HTTPBasicAuthr=requests.get('xxx' ,auth=HTTPBasicAuth('user' ,'password' )) print (r.status_code)import requestsr=requests.get('xxx' ,auth=('user' ,'password' )) print (r.status_code)
异常处理 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 import requestsfrom requests.exceptions import * try : r=requests.get('http://www.baidu.com' ,timeout=0.00001 ) except ReadTimeout: print ('===:' ) except RequestException: print ('Error' )
上传文件 1 2 3 4 import requestsfiles={'file' :open ('a.jpg' ,'rb' )} respone=requests.post('http://httpbin.org/post' ,files=files) print (respone.status_code)
selenium模块 官网:http://selenium-python.readthedocs.io
1 2 3 4 5 6 7 8 9 10 selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题 selenium本质是通过驱动浏览器,完全模拟浏览器的操作,比如跳转、输入、点击、下拉等,来拿到网页渲染之后的结果,可支持多种浏览器 from selenium import webdriverbrowser=webdriver.Chrome() browser=webdriver.Firefox() browser=webdriver.PhantomJS() browser=webdriver.Safari() browser=webdriver.Edge()
安装 有界面浏览器 selenium+chromedriver 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 pip3 install selenium 下载chromedriver.exe放到python安装路径的scripts目录中即可,注意最新版本是2.38,并非2.9 国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.38/ 最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads C:\Users\Administrator>python3 Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32 Type "help", "copyright", "credits" or "license" for more information. >>> from selenium import webdriver >>> driver=webdriver.Chrome() >>> driver.get('https://www.baidu.com') >>> driver.page_source selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver 下载链接:https://github.com/mozilla/geckodriver/releases
无界面浏览器selenium+谷歌浏览器headless模式 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 from selenium import webdriverfrom selenium.webdriver.chrome.options import Optionschrome_options = Options() chrome_options.add_argument('window-size=1920x3000' ) chrome_options.add_argument('--disable-gpu' ) chrome_options.add_argument('--hide-scrollbars' ) chrome_options.add_argument('blink-settings=imagesEnabled=false' ) chrome_options.add_argument('--headless' ) chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe" driver=webdriver.Chrome(chrome_options=chrome_options) driver.get('https://www.baidu.com' ) print ('hao123' in driver.page_source)driver.close()
基本使用 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait browser=webdriver.Chrome() try : browser.get('https://www.baidu.com' ) input_tag=browser.find_element_by_id('kw' ) input_tag.send_keys('美女' ) input_tag.send_keys(Keys.ENTER) wait=WebDriverWait(browser,10 ) wait.until(EC.presence_of_element_located((By.ID,'content_left' ))) print (browser.page_source) print (browser.current_url) print (browser.get_cookies()) finally : browser.close()
选择器 一 基本用法 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait import timedriver=webdriver.Chrome() driver.get('https://www.baidu.com' ) wait=WebDriverWait(driver,10 ) try : print (driver.find_element_by_id('kw' )) login=driver.find_elements_by_partial_link_text('录' )[0 ] login.click() print (driver.find_element_by_tag_name('a' )) button=wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'tang-pass-footerBarULogin' ))) button.click() input_user=wait.until(EC.presence_of_element_located((By.NAME,'userName' ))) input_pwd=wait.until(EC.presence_of_element_located((By.NAME,'password' ))) commit=wait.until(EC.element_to_be_clickable((By.ID,'TANGRAM__PSP_10__submit' ))) input_user.send_keys('18611453110' ) input_pwd.send_keys('xxxxxx' ) commit.click() driver.find_element_by_css_selector('#kw' ) time.sleep(5 ) finally : driver.close()
二 xpath 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait import timedriver=webdriver.PhantomJS() driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html' ) driver.implicitly_wait(3 ) try : driver.find_element_by_xpath('//body//a' ) driver.find_element_by_css_selector('body a' ) res1=driver.find_elements_by_xpath('//body//a[1]' ) print (res1[0 ].text) res1=driver.find_element_by_xpath('//a[5]' ) res2=driver.find_element_by_xpath('//a[@href="image5.html"]' ) res3=driver.find_element_by_xpath('//a[contains(@href,"image5")]' ) print ('==>' , res1.text) print ('==>' ,res2.text) print ('==>' ,res3.text) res1=driver.find_element_by_xpath('/html/body/div/a' ) print (res1.text) res2=driver.find_element_by_xpath('//a[img/@src="image3_thumb.jpg"]' ) print (res2.tag_name,res2.text) res3 = driver.find_element_by_xpath("//input[@name='continue'][@type='button']" ) res4 = driver.find_element_by_xpath("//*[@name='continue'][@type='button']" ) time.sleep(5 ) finally : driver.close()
三 获取标签属性 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait browser=webdriver.Chrome() browser.get('https://www.amazon.cn/' ) wait=WebDriverWait(browser,10 ) wait.until(EC.presence_of_element_located((By.ID,'cc-lm-tcgShowImgContainer' ))) tag=browser.find_element(By.CSS_SELECTOR,'#cc-lm-tcgShowImgContainer img' ) print (tag.get_attribute('src' ))print (tag.id )print (tag.location)print (tag.tag_name)print (tag.size)browser.close()
等待元素被加载 1 2 3 4 5 隐式等待:在browser.get('xxx' )前就设置,针对所有元素有效 显式等待:在browser.get('xxx' )之后设置,只针对某个元素有效
隐式等待 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait browser=webdriver.Chrome() browser.implicitly_wait(10 ) browser.get('https://www.baidu.com' ) input_tag=browser.find_element_by_id('kw' ) input_tag.send_keys('美女' ) input_tag.send_keys(Keys.ENTER) contents=browser.find_element_by_id('content_left' ) print (contents)browser.close()
显式等待 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait browser=webdriver.Chrome() browser.get('https://www.baidu.com' ) input_tag=browser.find_element_by_id('kw' ) input_tag.send_keys('美女' ) input_tag.send_keys(Keys.ENTER) wait=WebDriverWait(browser,10 ) wait.until(EC.presence_of_element_located((By.ID,'content_left' ))) contents=browser.find_element(By.CSS_SELECTOR,'#content_left' ) print (contents)browser.close()
元素交互操作 点击,清空 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait browser=webdriver.Chrome() browser.get('https://www.amazon.cn/' ) wait=WebDriverWait(browser,10 ) input_tag=wait.until(EC.presence_of_element_located((By.ID,'twotabsearchtextbox' ))) input_tag.send_keys('iphone 8' ) button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input' ) button.click() import timetime.sleep(3 ) input_tag=browser.find_element_by_id('twotabsearchtextbox' ) input_tag.clear() input_tag.send_keys('iphone7plus' ) button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input' ) button.click()
Action Chains 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait import timedriver = webdriver.Chrome() driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable' ) wait=WebDriverWait(driver,3 ) try : driver.switch_to.frame('iframeResult' ) sourse=driver.find_element_by_id('draggable' ) target=driver.find_element_by_id('droppable' ) ActionChains(driver).click_and_hold(sourse).perform() distance=target.location['x' ]-sourse.location['x' ] track=0 while track < distance: ActionChains(driver).move_by_offset(xoffset=2 ,yoffset=0 ).perform() track+=2 ActionChains(driver).release().perform() time.sleep(10 ) finally : driver.close()
在交互动作比较难实现的时候可以自己写JS(万能方法) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait try : browser=webdriver.Chrome() browser.get('https://www.baidu.com' ) browser.execute_script('alert("hello world")' ) finally : browser.close()
补充:frame的切换
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWait try : browser=webdriver.Chrome() browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable' ) browser.switch_to.frame('iframeResult' ) tag1=browser.find_element_by_id('droppable' ) print (tag1) browser.switch_to.parent_frame() tag2=browser.find_element_by_id('textareaCode' ) print (tag2) finally : browser.close()
其他 模拟浏览器的前进后退 1 2 3 4 5 6 7 8 9 10 11 12 13 import timefrom selenium import webdriverbrowser=webdriver.Chrome() browser.get('https://www.baidu.com' ) browser.get('https://www.taobao.com' ) browser.get('http://www.sina.com.cn/' ) browser.back() time.sleep(10 ) browser.forward() browser.close()
cookies 1 2 3 4 5 6 7 8 9 10 from selenium import webdriverbrowser=webdriver.Chrome() browser.get('https://www.zhihu.com/explore' ) print (browser.get_cookies())browser.add_cookie({'k1' :'xxx' ,'k2' :'yyy' }) print (browser.get_cookies())
选项卡管理 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 import timefrom selenium import webdriverbrowser=webdriver.Chrome() browser.get('https://www.baidu.com' ) browser.execute_script('window.open()' ) print (browser.window_handles) browser.switch_to_window(browser.window_handles[1 ]) browser.get('https://www.taobao.com' ) time.sleep(10 ) browser.switch_to_window(browser.window_handles[0 ]) browser.get('https://www.sina.com.cn' ) browser.close()
异常处理 1 2 3 4 5 6 7 8 9 10 11 12 13 14 from selenium import webdriverfrom selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameExceptiontry : browser=webdriver.Chrome() browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable' ) browser.switch_to.frame('iframssseResult' ) except TimeoutException as e: print (e) except NoSuchFrameException as e: print (e) finally : browser.close()
项目练习 自动登录163邮箱并发送邮件 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 from selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.common.by import Byfrom selenium.webdriver.common.keys import Keysfrom selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.support.wait import WebDriverWaitbrowser=webdriver.Chrome() try : browser.get('http://mail.163.com/' ) wait=WebDriverWait(browser,5 ) frame=wait.until(EC.presence_of_element_located((By.ID,'x-URS-iframe' ))) browser.switch_to.frame(frame) wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-container' ))) inp_user=browser.find_element_by_name('email' ) inp_pwd=browser.find_element_by_name('password' ) button=browser.find_element_by_id('dologin' ) inp_user.send_keys('18611453110' ) inp_pwd.send_keys('xxxx' ) button.click() wait.until(EC.presence_of_element_located((By.ID,'dvNavTop' ))) write_msg=browser.find_elements_by_css_selector('#dvNavTop li' )[1 ] write_msg.click() wait.until(EC.presence_of_element_located((By.CLASS_NAME,'tH0' ))) recv_man=browser.find_element_by_class_name('nui-editableAddr-ipt' ) title=browser.find_element_by_css_selector('.dG0 .nui-ipt-input' ) recv_man.send_keys('378533872@qq.com' ) title.send_keys('圣旨' ) print (title.tag_name) frame=wait.until(EC.presence_of_element_located((By.CLASS_NAME,'APP-editor-iframe' ))) browser.switch_to.frame(frame) body=browser.find_element(By.CSS_SELECTOR,'body' ) body.send_keys('egon很帅,可以加工资了' ) browser.switch_to.parent_frame() send_button=browser.find_element_by_class_name('nui-toolbar-item' ) send_button.click() import time time.sleep(10000 ) except Exception as e: print (e) finally : browser.close()
# Section "爬取京东商城商品信息": scrape product listings from JD.com search results.
import sys
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


def _describe_good(good):
    """Return the printable summary (name, link, price, comments) of one product element."""
    detail_url = good.find_element(By.TAG_NAME, 'a').get_attribute('href')
    p_name = good.find_element(By.CSS_SELECTOR, '.p-name em').text.replace('\n', '')
    price = good.find_element(By.CSS_SELECTOR, '.p-price i').text
    p_commit = good.find_element(By.CSS_SELECTOR, '.p-commit a').text
    return '''
    商品 : %s
    链接 : %s
    价钱 :%s
    评论 :%s
    ''' % (p_name, detail_url, price, p_commit)


def _print_page(driver):
    """Print a summary of every 'gl-item' element on the page currently loaded in *driver*."""
    for good in driver.find_elements(By.CLASS_NAME, 'gl-item'):
        try:
            print(_describe_good(good), end='\n\n')
        except NoSuchElementException as exc:
            # One item lacking an expected sub-element should not abort the
            # whole crawl; report it and move on to the next item.
            print(f'skipping incomplete item: {exc}', file=sys.stderr)


def get_goods(driver, max_pages=1000):
    """Print the products on the current result page and on every following page.

    Pages are walked iteratively by clicking the link whose text contains
    '下一页'. The walk ends when that link is missing, when it is marked
    disabled, when a WebDriver error occurs (reported on stderr), or after
    *max_pages* pages.

    Args:
        driver: A Selenium WebDriver already showing a search-result page.
        max_pages: Upper bound on the number of pages visited. The default
            mirrors the ceiling the earlier recursive implementation hit
            through Python's default recursion limit.
    """
    for _ in range(max_pages):
        try:
            _print_page(driver)
            next_button = driver.find_element(By.PARTIAL_LINK_TEXT, '下一页')
            # NOTE(review): assumes the site flags an exhausted pager with a
            # 'disabled' CSS class -- confirm against the live markup.
            if 'disabled' in (next_button.get_attribute('class') or ''):
                return
            next_button.click()
        except NoSuchElementException:
            # No next-page link: the last page has been printed.
            return
        except WebDriverException as exc:
            # Best-effort crawl: stop, but say why instead of failing silently.
            print(f'stopping crawl: {exc}', file=sys.stderr)
            return
        # Give the next page a moment to render before reading it.
        time.sleep(1)


def spider(url, keyword):
    """Open *url* in Chrome, search for *keyword* and print all result pages.

    Args:
        url: Address of the shop's landing page (must contain the '#key'
            search box).
        keyword: Search term typed into the search box.
    """
    driver = webdriver.Chrome()
    try:
        # Set the implicit wait before navigating so it covers every lookup.
        driver.implicitly_wait(3)
        driver.get(url)
        input_tag = driver.find_element(By.ID, 'key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        # quit() ends the whole WebDriver session (browser and driver process);
        # close() would only close the current window.
        driver.quit()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='iPhone8手机')