A collection of crawler development scripts

pipidi

zhujingdi1998@gmail.com

1. Converting a cookie string into a Python dictionary

Straight to the code: split the cookie string on semicolons into items, take what comes before the first equals sign as the key and what comes after as the value, and collect the pairs into a dictionary.

class TransCookie:
    def __init__(self, cookie):
        self.cookie = cookie

    def string_to_dict(self):
        item_dict = {}
        # Cookie items are separated by ';' and each item is 'key=value'.
        for item in self.cookie.split(';'):
            # Split on the first '=' only: cookie values may themselves
            # contain '=' (base64 padding, for example).
            key, _, value = item.partition('=')
            item_dict[key.strip()] = value
        return item_dict

cookie='phone=; password=; gr_user_id=bd56b5b9-3da0-49ea-aa05-bb84aabab3a4; gr_session_id_a0f0692c929ad91f=8da0f95c-5ba0-4981-9cfe-e7032c9cd1d1; JSESSIONID=F0BD0C528CC4911EF46456365B5C48EC'  
trans = TransCookie(cookie)
print(trans.string_to_dict())
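
For comparison, the standard library can parse cookie strings as well; a minimal sketch using http.cookies.SimpleCookie (an alternative, not the approach above):

from http.cookies import SimpleCookie

simple = SimpleCookie()
simple.load(cookie)  # reuse the cookie string defined above
print({key: morsel.value for key, morsel in simple.items()})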

2. Converting request headers copied from the browser into a dictionary

Hand-editing headers copied out of the browser into a dictionary gets really tedious, so script it instead.

def headers_to_dict(text):
    head_dict = {}
    for item in text.splitlines():
        item = item.strip()
        if not item:
            continue
        if item.startswith(':'):
            # HTTP/2 pseudo-headers such as ':authority' and ':path' keep
            # their leading ':' as part of the name.
            key, _, value = item[1:].partition(':')
            head_dict[':' + key] = value
        else:
            # Split on the first ':' only; values such as referer URLs
            # contain colons of their own.
            key, _, value = item.partition(':')
            head_dict[key] = value
    return head_dict
if __name__ == '__main__':  
    request_Head = ''':authority:www.google.ca
:method:GET 
:path:/xjs/_/js/k=xjs.s.zh_CN.RatPuRYNpgk.O/m=aa,abd,async,dvl,foot,fpe,ipv6,lu,m,mu,sf,sonic,spch,tl,vs,d3l,tnv,mrn,exdp,udlg,me,kptm,iud,iuci,shrb,dgm,qtf,tcc,atn/am=wCL0eMEByP8PAopEKwgsQJpgGBo/exm=sx,sb,cdos,cr,elog,hsm,jsa,r,d,csi/rt=j/d=1/ed=1/t=zcms/rs=ACT90oFUZZAh1baiB5gB9X__zAAB6iaufw?xjs=s1 
:scheme:https 
accept:*/* 
accept-encoding:gzip, deflate, br 
accept-language:zh,en-US;q=0.9,en;q=0.8 
cookie:1P_JAR=2018-02-21-11; DV=k_FBxNmnkCIVYCnKbBlF6KJvO3mCGxY; UULE=a+cm9sZToxIHByb2R1Y2VyOjEyIHByb3ZlbmFuY2U6NiB0aW1lc3RhbXA6MTUxOTIxMzA2MjY3NDAwMCBsYXRsbmd7bGF0aXR1ZGVfZTc6Mjk5MDA4NDc3IGxvbmdpdHVkZV9lNzoxMjE2MzU4NTUzfSByYWRpdXM6MTI0MDA=; NID=124=IFqIMdsjN2qpy2mn1bLf7If1nga5a_V_kVat4N9np8URiXLtHLv2_gLMFSUEbdDBz4Lz0sp1OaGwfeVPuehngEGPKKjdId6bKviZ7XVyfHqmvhm8tPVGcuOFHm4LL-QS 
referer:https://www.google.ca/ 
user-agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36 
x-client-data:CLO1yQEIkbbJAQiitskBCKmdygEIqKPKAQ=='''  
    print(headers_to_dict(request_Head))
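
One caveat before reusing the parsed dict with requests: names such as ':authority' are HTTP/2 pseudo-headers, and requests speaks HTTP/1.1, so they should be filtered out first. A minimal sketch (the target URL is just illustrative):

import requests

parsed = headers_to_dict(request_Head)
# Drop HTTP/2 pseudo-headers (':authority', ':method', ':path',
# ':scheme') before handing the dict to an HTTP/1.1 client.
clean_headers = {k: v for k, v in parsed.items() if not k.startswith(':')}
print(requests.get('https://www.google.ca/', headers=clean_headers).status_code)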
3. Scraping a free proxy list and testing each proxy

Scrape the China page of proxynova.com's proxy list, pull the IP and port out of each table row, then verify every proxy by fetching httpbin.org/ip through it.

import re

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
}

# Another free list: https://www.proxyrotator.com/free-proxy-list/3/#free-proxy-list
content = requests.get('https://www.proxynova.com/proxy-server-list/country-cn/',
                       headers=headers).text
selector = etree.HTML(content)

proxy_list = []
for row in selector.xpath('//*[@id="tbl_proxy_list"]/tbody[1]/tr'):
    # The port number sits in the left-aligned cell; skip rows without one.
    ports = re.findall(r'\d+', "".join(row.xpath('./td[@align="left"]/text()')))
    if ports:
        ip = row.xpath('./td/abbr/@title')[0]
        proxy_list.append(ip + ':' + ports[0])

# Check each proxy by requesting httpbin.org/ip through it.
for proxy in proxy_list:
    try:
        proxies = {'http': proxy}
        print(requests.get('http://httpbin.org/ip', proxies=proxies).text)
    except Exception as e:
        print(e)
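
Free proxies are often dead or painfully slow, so in practice the check loop needs a timeout to fail fast. A small refinement sketch (the working_proxies helper and the 5-second timeout are assumptions, not part of the original script):

import requests

def working_proxies(proxy_list, timeout=5):
    # Hypothetical helper: keep only proxies that answer within `timeout`
    # seconds; a dead free proxy would otherwise hang the loop.
    alive = []
    for proxy in proxy_list:
        try:
            resp = requests.get('http://httpbin.org/ip',
                                proxies={'http': proxy}, timeout=timeout)
            print(proxy, '->', resp.text.strip())
            alive.append(proxy)
        except Exception as e:
            print(proxy, 'failed:', e)
    return alive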
