1. Requests

Reference: http://www.python-requests.org/en/master/user/quickstart/#make-a-request

Requests is a very practical Python HTTP client library, frequently used when writing crawlers and when testing server responses. Requests fully covers the needs of today's web.

It is usually installed with pip install requests.

In [1]: import requests

In [2]: response = requests.get('https://api.github.com/events')

In [3]: print(response)
<Response [200]>

In [4]: response = requests.post('http://httpbin.org/post', data={'key1': 'values1'})    # used when submitting a form

In [5]: print(response)
<Response [200]>

In [7]: response = requests.put('http://httpbin.org/put', data={'key1': 'values1'})

In [8]: print(response)
<Response [200]>

In [10]: response = requests.delete('http://httpbin.org/delete')

In [11]: print(response)
<Response [200]>

In [13]: response = requests.head('http://httpbin.org/get')

In [14]: print(response)
<Response [200]>

In [15]: response = requests.options('http://httpbin.org/get')

In [16]: print(response)
<Response [200]>

In [17]: payload = {'key1': 'value1', 'key2': 'value2'}

In [18]: response = requests.get('http://httpbin.org/get', params=payload)    # send a GET request carrying parameters

In [19]: print(response)
<Response [200]>
In [20]: print(response.text)
{
  "args": {
    "key1": "value1",
    "key2": "value2"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.18.4"
  },
  "origin": "103.215.2.233",
  "url": "http://httpbin.org/get?key1=value1&key2=value2"
}

In [22]: print(response.url)
http://httpbin.org/get?key1=value1&key2=value2

In [23]: payload = {'key1': 'value1', 'key2': ['value2', 'value3']}

In [24]: response = requests.get('http://httpbin.org/get', params=payload)

In [25]: print(response.url)
http://httpbin.org/get?key1=value1&key2=value2&key2=value3

In [27]: response = requests.get('http://api.github.com/events')

In [28]: response.encoding              # character-set encoding
Out[28]: 'utf-8'

In [29]: print(response.text)           # body as text
[{"id":"6850814749","type":"CreateEvent","actor":{"id":679017,"login":......

In [30]: print(response.content)        # body as raw bytes
b'[{"id":"6850814749","type":"CreateEvent","actor":{"id":679017,"login":".....

In [34]: response.json()                # body parsed as JSON

In [36]: response.status_code           # HTTP status code
Out[36]: 200

In [38]: headers = {
    ...:     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    ...:     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    ...:     'Accept-Encoding': 'gzip, deflate, br',
    ...:     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    ...:     'Connection': 'keep-alive'}

In [39]: response = requests.get('https://api.github.com/events', headers=headers)

In [40]: print(response.headers)
{'Server': 'GitHub.com', 'Date': 'Tue, 14 Nov 2017 06:10:31 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Status': '200 OK', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '58', 'X-RateLimit-Reset': '1510642339', 'Cache-Control': 'public, max-age=60, s-maxage=60', 'Vary': 'Accept', 'ETag': 'W/"34b51a08c5a8f4fa2400dd5c0d89221b"', 'Last-Modified': 'Tue, 14 Nov 2017 06:10:31 GMT', 'X-Poll-Interval': '60', 'X-GitHub-Media-Type': 'unknown, github.v3', 'Link': '<...>; rel="next", <...>; rel="last"', 'Access-Control-Expose-Headers': 'ETag, Link, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval', 'Access-Control-Allow-Origin': '*', 'Content-Security-Policy': "default-src 'none'", 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-XSS-Protection': '1; mode=block', 'X-Runtime-rack': '0.104190', 'Content-Encoding': 'gzip', 'X-GitHub-Request-Id': 'D528:C0F5:6BAAA:E4CB6:5A0A88D6'}

In [43]: print(response.headers['Content-Type'])
application/json; charset=utf-8

In [44]: print(response.headers.get('Content-Type'))
application/json; charset=utf-8

In [45]: url = 'http://www.baidu.com'

In [46]: response = requests.get(url, headers=headers)        # a request to Baidu returns cookies; some sites set none

In [47]: print(response.cookies)                              # print the whole cookie jar
<RequestsCookieJar[<Cookie BDSVRTM=0 for www.baidu.com/>, <Cookie BD_HOME=0 for www.baidu.com/>, <Cookie H_PS_PSSID=1425_21088_24880 for .baidu.com/>]>

In [48]: for k, v in response.cookies.get_dict().items():     # iterate over the cookie contents
    ...:     print(k, v)
    ...:
H_PS_PSSID 1425_21088_24880
BDSVRTM 0
BD_HOME 0

In [49]: cookies = {'c1': 'v1', 'c2': 'v2'}

In [50]: response = requests.get('http://httpbin.org/cookies', cookies=cookies)    # send a request carrying cookies

In [52]: print(response.text)
{
  "cookies": {
    "c1": "v1",
    "c2": "v2"
  }
}

In [53]: jar = requests.cookies.RequestsCookieJar()

In [54]: jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
Out[54]: Cookie(version=0, name='tasty_cookie', value='yum', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/cookies', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [55]: jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
Out[55]: Cookie(version=0, name='gross_cookie', value='blech', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/elsewhere', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [56]: url = 'http://httpbin.org/cookies'

In [57]: response = requests.get(url, cookies=jar)

In [58]: print(response.text)
{
  "cookies": {
    "tasty_cookie": "yum"
  }
}

Cookies are returned in a RequestsCookieJar, which acts like a dict but also offers a more complete interface, suitable for use over multiple domains or paths. Cookie jars can also be passed in to requests
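Because a requests.Session keeps its own cookie jar, cookies set by the server are resent automatically on later requests. A minimal sketch, assuming httpbin.org is reachable (its /cookies/set endpoint answers with a Set-Cookie header):

import requests

sess = requests.Session()

# The server sets a cookie; the session stores it in its jar.
sess.get('http://httpbin.org/cookies/set/sessioncookie/123456789')

# The stored cookie is sent automatically on the next request.
response = sess.get('http://httpbin.org/cookies')
print(response.text)    # {"cookies": {"sessioncookie": "123456789"}}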

In [62]: url = 'http://github.com'

In [64]: response = requests.get(url, allow_redirects=True)

In [65]: print(response.url)
https://github.com/

In [66]: response.history
Out[66]: [<Response [301]>]

In [69]: url = 'http://httpbin.org/post'

In [70]: files = {'file': open('test.txt', 'rb')}

In [71]: response = requests.post(url, files=files)                # upload a file with a POST request

In [72]: response.text
Out[72]: '...contents of the file...'

In [73]: response = requests.get('https://github.com', timeout=5)   # request timeout, in seconds
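If the server does not respond within the timeout, requests raises an exception instead of blocking forever. A minimal sketch of catching it (the one-second limit is arbitrary):

import requests

try:
    response = requests.get('https://github.com', timeout=1)
except requests.exceptions.Timeout:
    # No response arrived within the given number of seconds.
    print('request timed out')
except requests.exceptions.RequestException as e:
    # Base class of all other requests errors (DNS failure, refused connection, ...).
    print('request failed:', e)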

import json
import requests
from io import BytesIO
from PIL import Image

#1 Handling images

r = requests.get('http://img.jrjimg.cn/2013/11/20131105065502114.jpg')
image = Image.open(BytesIO(r.content))    # build an Image object from the binary content
image.save('mm.jpg')

#2 Handling JSON

r = requests.get('https://github.com/timeline.json')
print(type(r.json()))
print(r.json())
print(r.text)
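r.json() raises a ValueError when the body is not valid JSON, so a defensive version of the above might look like this sketch:

import requests

r = requests.get('https://api.github.com/events')
try:
    data = r.json()           # parse the body as JSON
    print(type(data))         # usually a list or a dict
except ValueError:
    # The body was not valid JSON (e.g. an HTML error page).
    print('response is not JSON:', r.text[:100])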

#3 Handling raw data

r = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1508166336374&di=ef1073a52a7582f29ffa27c47e95e74e&imgtype=0&src=http%3A%2F%2Fp3.gexing.com%2FG1%2FM00%2F3F%2FDD%2FrBACE1MaezngiEoIAADSr3bccSw151.jpg')
with open('mm2.jpg', 'wb+') as f:
    for chunk in r.iter_content(1024):
        f.write(chunk)
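For large files it is worth adding stream=True so the body is fetched chunk by chunk as iter_content() is consumed, instead of being read into memory up front. A sketch (the URL is a placeholder):

import requests

r = requests.get('http://example.com/big-file.jpg', stream=True)
with open('big-file.jpg', 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:             # skip keep-alive chunks
            f.write(chunk)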

#4 Handling forms

form = {'username': 'user', 'password': 'pwd'}
r = requests.post('http://httpbin.org/post', data=form)               # sent form-encoded
print(r.text)
r = requests.post('http://httpbin.org/post', data=json.dumps(form))   # sent as a raw JSON string
print(r.text)
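requests also accepts a json= keyword that serializes the dict and sets the Content-Type: application/json header in one step, which is usually simpler than json.dumps. A sketch against the same httpbin endpoint:

import requests

form = {'username': 'user', 'password': 'pwd'}

# json= serializes the dict and sets the Content-Type header automatically;
# httpbin echoes it back under the "json" key.
r = requests.post('http://httpbin.org/post', json=form)
print(r.json()['json'])    # {'username': 'user', 'password': 'pwd'}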

2. Scraping the Douban movie list and ratings with Requests

The scraping code is as follows:

import requests
from lxml import etree

sess = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive'}

for id in range(0, 250, 25):
    url = 'https://movie.douban.com/top250/?start=' + str(id)
    r = sess.get(url, headers=headers)
    r.encoding = 'utf-8'
    #fname = "movie" + str(id) + ".txt"
    #with open(fname, "wb+") as f:
    #    f.write(r.content)
    root = etree.HTML(r.content)    # parse the HTML document with the lxml parser
    items = root.xpath('//ol/li/div[@class="item"]')
    for item in items:
        title = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        name = title[0].encode('gb2312', 'ignore').decode('gb2312')
        rating = item.xpath('.//div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
        print(name, rating)

3. BeautifulSoup

The BeautifulSoup module takes an HTML/XML string and parses it into a structured tree, after which its methods make it easy to locate specific elements in the document. Beautiful Soup supports the HTML parser in the Python standard library as well as several third-party parsers; if no third-party parser is installed, the default Python parser is used. Common parsers are lxml, html5lib, and html.parser, of which lxml is the most capable and fastest, and is the recommended install.
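A minimal sketch of picking a parser explicitly (the HTML snippet here is made up); naming the parser as the second argument also avoids the warning BeautifulSoup emits when it has to guess:

from bs4 import BeautifulSoup

html = '<html><head><title>demo</title></head><body><p>hi</p></body></html>'

soup_std = BeautifulSoup(html, 'html.parser')    # standard library, no extra install
soup_lxml = BeautifulSoup(html, 'lxml')          # fastest; requires pip install lxml
print(soup_lxml.title.string)                    # demo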

from bs4 import BeautifulSoup

soup = BeautifulSoup(open('test.html'))    # this form is suitable for parsing a local file
print(soup.prettify())                     # pretty-print the parse tree

#1 Handling tags

print(type(soup.title))
print(soup.title)
print(soup.title.name)

#2 String

print(type(soup.title.string))
print(soup.title.string)

#3 Comment

print(type(soup.a.string))
print(soup.a.string)
for item in soup.body.contents:
    print(item.name)

#4 CSS query

print(soup.select('.sister'))
print(soup.select('#link1'))
print(soup.select('head > title'))
a_s = soup.select('a')
for a in a_s:
    print(a)

Example:

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title asdf"><b>The Dormouse's story总共</b><span>f</span></div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body></html>
"""
soup = BeautifulSoup(html_doc, features="lxml")
tag1 = soup.find(name='a')         # find the first a tag
tag2 = soup.find_all(name='a')     # find all a tags
tag3 = soup.select('#link2')       # find the tag with id=link2
print(tag1.name)           # prints: a
print(tag1.attrs)          # prints the dict {'class': ['sister0'], 'id': 'link1'}
tag1.attrs['id'] = 'link01'
print(tag1.attrs)          # prints the dict {'class': ['sister0'], 'id': 'link01'}
print(tag1.has_attr('id')) # prints: True
print(tag1.get_text('id')) # prints: Elsidfidie
tag1.name = 'soup'         # set the tag's name
print(tag2)                # prints [<soup class="sister0" id="link01">Els<span>f</span>ie</soup>, ......]
print(tag2[0].name)        # prints: soup

# decode converts to a string (including the current tag); decode_contents does the same without the current tag

print(tag2[1])                  # prints <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
print(type(tag2[1]))            # prints <class 'bs4.element.Tag'>
print(tag2[1].decode())         # prints <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
print(type(tag2[1].decode()))   # prints <class 'str'>

# encode converts to bytes (including the current tag); encode_contents does the same without the current tag

print(tag2[1].encode())            # prints b'<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>'
print(type(tag2[1].encode()))      # prints <class 'bytes'>
print(tag2[1].get_text())          # prints Lacie
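The comments above mention decode_contents() and encode_contents() but the transcript never shows them; a short sketch of the difference, continuing with the same tag2:

print(tag2[1].decode())            # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> (tag included)
print(tag2[1].decode_contents())   # Lacie (tag dropped)
print(tag2[1].encode_contents())   # b'Lacie' (bytes instead of str)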
body = soup.find(name='body')       # all direct child tags
childs = body.children
print(childs)                       # prints a list_iterator object
for tag in childs:
    print(tag)
body = soup.find(name='body')       # all descendant tags, recursively
descs = body.descendants
print(descs)                        # prints a generator object
for des in descs:
    print(des)
body = soup.find(name='body')       # empty out all of the tag's children, keeping the tag itself
body.clear()
print(soup)
body = soup.find(name='body')
body.decompose()                    # recursively destroy the tag and everything inside it
print(soup)
body = soup.find(name='body')
d = body.extract()                  # recursively remove the tag and return what was removed
print(soup)
print(d)
body = soup.find(name='body')
index = body.index(body.find('div'))  # position of a tag within its parent; prints 1
print(index)
br = soup.find(name='br')
test = br.is_empty_element   # prints True; tests whether the tag is one of: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
print(test)
span = soup.find('span')
print(span)                  # prints <span>f</span>
print(span.string)           # prints f
span.string = 'yeecall.com'  # set the string
print(span.string)           # prints yeecall.com
body = soup.find(name='body')
texts = body.stripped_strings  # recursively collect the text of all inner tags
print(texts)                   # prints a generator object
for text in texts:
    print(text)

# Examples of CSS selectors with select()

soup.select("title")soup.select("p nth-of-type(3)")soup.select("body a")soup.select("html head title")tag = soup.select("span,a")soup.select("head > title")soup.select("p > a")soup.select("p > a:nth-of-type(2)")soup.select("p > #link1")soup.select("body > a")soup.select("#link1 ~ .sister")soup.select("#link1 + .sister")soup.select(".sister")soup.select("[class~=sister]")soup.select("#link1")soup.select("a#link2")soup.select('a[href]')soup.select('a[href="http://example.com/elsie"]')soup.select('a[href^="http://example.com/"]')soup.select('a[href$="tillie"]')soup.select('a[href*=".com/el"]')

4. Logging in to Douban with requests and BeautifulSoup

Part of the login page's source HTML is as follows:

Part of the captcha's source HTML is as follows:

So the login code is as follows:

import requests
import html5lib
import re
from bs4 import BeautifulSoup

sess = requests.Session()
url_login = 'https://accounts.douban.com/login'
formdata = {
    'redir': 'https://www.douban.com',
    'source': 'index_nav',
    'form_email': '******@*****.com',
    'form_password': '*********',
    'login': u'登录'
}
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

r = sess.post(url_login, data=formdata, headers=headers)
content = r.text
soup = BeautifulSoup(content, 'html5lib')
captcha = soup.find('img', id='captcha_image')
if captcha:
    print(captcha)
    captcha_url = captcha['src']
    #re_captcha_id = r'id="(.*?)"&'
    #captcha_id = re.findall(re_captcha_id, captcha)
    captcha_id = re.findall(r'(id=)(.*)(&)', captcha_url)
    captcha_id = captcha_id[0][1]
    print(captcha_url)
    print(captcha_id)
    captcha_text = input('Please input the captcha:')
    formdata['captcha-solution'] = captcha_text
    formdata['captcha-id'] = captcha_id
    print(formdata)
    r = sess.post(url_login, data=formdata, headers=headers)
with open('contacts.txt', 'w+', encoding='utf-8') as f:
    f.write(r.text)
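The code asks the user to type the captcha characters but never fetches the image itself. A minimal sketch of downloading it, reusing the same sess and the captcha_url extracted above (captcha.jpg is just a hypothetical local filename the user then opens), which could sit right before the input() call:

# Hypothetical step: save the captcha image so the user can look at it.
img = sess.get(captcha_url, headers=headers)
with open('captcha.jpg', 'wb') as f:
    f.write(img.content)
print('captcha saved to captcha.jpg; open it and type the characters')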

These are just my personal study notes; corrections from more experienced readers are welcome, so please go easy.