This is a crawler utility class I wrote myself. It can send GET/POST requests to fetch pages, handle cookies, parse pages with regular expressions and XPath, and run a simple keyword-search crawler.
Apart from lxml, everything it uses ships with a standard Python 2 installation.
If you do not have that library, install it with pip, or delete the `from lxml import etree` import together with the getXpath method:
$ pip install lxml
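For orientation before the full listing, here is a minimal usage sketch. It assumes the listing below is saved as crawlerTool.py (that file name and the example URL are placeholders of mine); the methods are the ones defined in the class.

# minimal usage sketch, assuming the listing below is saved as crawlerTool.py
from crawlerTool import crawlerTool

html = crawlerTool.getPage('http://example.com')                 # plain GET, returns the page body
title = crawlerTool.getRegex(r'<title>(.*?)</title>', html)      # first regex capture group
links = crawlerTool.getXpath('//a/@href', html)                  # XPath extraction via lxml
detail = crawlerTool.getPageDetail('http://example.com')         # dict with 'code', 'cookie', 'pageContent'
print title, len(links), detail.get('code')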
Code:
#! /usr/bin/python
#coding=utf-8
import sys
import urllib
import urllib2
import re
import os
import cookielib
import json
from lxml import etree
class requestPars:
    PROXY = 'proxy'
    USER_AGENT = 'userAgent'
    DATA = 'data'
    COOKIE = 'cookie'
# general-purpose helper methods
class crawlerTool:
    # class-level (static) variable
    log = ''

    def __init__(self):
        pass

    # basic page fetch, returns the raw page content
    # getPage(url,data=xx) getPage(url,requestPars.=xx)
    @staticmethod
    def getPage(url, proxy=None, data=None, referer=None, cookie=None, userAgent=None, cookiePath=None):
        # print url
        crawlerTool.log = crawlerTool.log + url
        page_buf = ''
        i = 0
        for i in range(1):
            # print url
            try:
                if proxy:
                    handlers = [urllib2.ProxyHandler({'http': 'http://%s/' % proxy, 'https': 'http://%s/' % proxy})]
                    opener = urllib2.build_opener(*handlers)
                else:
                    opener = urllib2.build_opener()
                method = urllib2.Request(url, data)
                if referer:
                    method.add_header('Referer', referer)
                if cookiePath:
                    method.add_header('Cookie', crawlerTool.readCookie(cookiePath))
                if cookie:
                    method.add_header('Cookie', cookie)
                if userAgent:
                    method.add_header('User-Agent', userAgent)
                else:
                    method.add_header('User-Agent',
                                      'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36')
                method.add_header('Accept-Language', 'en-US,en;q=0.5')
                result = opener.open(method, timeout=10)
                page_buf = result.read()
                return page_buf
            except urllib2.URLError, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                return str(reason)
            except Exception, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                raise Exception(reason)
        pass
    # getPageByJson: POST a JSON body; data input is a dict
    # getPage(url,data=xx) getPage(url,requestPars.=xx)
    @staticmethod
    def getPageByJson(url, proxy=None, data={}, referer=None, cookie=None, userAgent=None, cookiePath=None):
        # print url
        crawlerTool.log = crawlerTool.log + url
        page_buf = ''
        i = 0
        for i in range(1):
            # print url
            try:
                if proxy:
                    handlers = [urllib2.ProxyHandler({'http': 'http://%s/' % proxy, 'https': 'http://%s/' % proxy})]
                    opener = urllib2.build_opener(*handlers)
                else:
                    opener = urllib2.build_opener()
                if type(data) == type({}): data = json.dumps(data)
                method = urllib2.Request(url, data=data)  # note: Python None maps to JSON null
                method.add_header('Content-Type', 'application/json')
                if referer:
                    method.add_header('Referer', referer)
                if cookiePath:
                    method.add_header('Cookie', crawlerTool.readCookie(cookiePath))
                if cookie:
                    method.add_header('Cookie', cookie)
                if userAgent:
                    method.add_header('User-Agent', userAgent)
                else:
                    method.add_header('User-Agent',
                                      'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36')
                method.add_header('Accept-Language', 'en-US,en;q=0.5')
                result = opener.open(method, timeout=10)
                page_buf = result.read()
                return page_buf
            except urllib2.URLError, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                return str(reason)
            except Exception, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                raise Exception(reason)
        pass
    # return the first capture group of the first regex match
    @staticmethod
    def getRegex(pattern, content):
        group = re.search(pattern, content)
        if group:
            return group.groups()[0]
        else:
            return ''
    # evaluate an XPath expression; input-type checking and error handling are still TODO
    @staticmethod
    def getXpath(xpath, content):
        tree = etree.HTML(content)
        out = []
        results = tree.xpath(xpath)
        for result in results:
            if 'ElementStringResult' in str(type(result)):
                out.append(result)
            else:
                out.append(etree.tostring(result))
        return out
    # get the final URL after redirects
    @staticmethod
    def getDirectUrl(url):
        u = urllib2.urlopen(url)
        redirectUrl = u.geturl()
        return redirectUrl
    # return various details about the page as a dict
    @staticmethod
    def getPageDetail(url, proxy=None, data=None, referer=None, cookie=None, userAgent=None, cookiePath=None):
        PageDetail = {}
        # print url
        crawlerTool.log = crawlerTool.log + url + '\n'
        page_buf = ''
        n = 1
        for i in range(n):
            # print url
            try:
                getCookie = cookielib.CookieJar()
                cookieHandler = urllib2.HTTPCookieProcessor(getCookie)
                if proxy:
                    handlers = [urllib2.ProxyHandler({'http': 'http://%s/' % proxy, 'https': 'http://%s/' % proxy}), cookieHandler]
                    opener = urllib2.build_opener(*handlers)
                else:
                    opener = urllib2.build_opener(cookieHandler)
                method = urllib2.Request(url, data)
                if referer:
                    method.add_header('Referer', referer)
                if cookiePath:
                    method.add_header('Cookie', crawlerTool.readCookie(cookiePath))
                if cookie:
                    method.add_header('Cookie', cookie)
                if userAgent:
                    method.add_header('User-Agent', userAgent)
                else:
                    method.add_header('User-Agent',
                                      'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36')
                method.add_header('Accept-Language', 'en-US,en;q=0.5')
                result = opener.open(method, timeout=10)
                # print str(result.headers)
                page_buf = result.read()
                PageDetail['pageContent'] = page_buf
                PageDetail['code'] = 200
                cookie_str = ''
                for item in getCookie:
                    cookie_str += item.name + "=" + item.value + "; "
                PageDetail['cookie'] = cookie_str
                # print 'getcookie:' + cookie_str
                break
            except urllib2.HTTPError, e:
                # print e.reason
                PageDetail['code'] = e.code
                PageDetail['cookie'] = e.headers.get('Set-Cookie', '')  # Baidu 403 responses still carry a cookie we need
                # print e.headers.get('Set-Cookie','')
            except urllib2.URLError, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                # print reason.read()
                PageDetail['code'] = 1003
                # print 'URLError' + str(reason)
                break
            except Exception, reason:
                if i == n:
                    crawlerTool.log = crawlerTool.log + str(reason)
                    # print 'Error' + str(reason)
                break
        return PageDetail
    # save a cookie string to path, creating the file if it does not exist;
    # if you do not need to write it yourself, cookielib.MozillaCookieJar(filename) works too
    @staticmethod
    def saveCookie(cookie, path):
        if os.path.isdir(path):
            crawlerTool.log = crawlerTool.log + 'path cant be dir\n'
            sys.exit(0)
        try:
            if not os.path.exists(path):
                parent_path = os.path.dirname(path)
                if not os.path.exists(parent_path): os.makedirs(parent_path)  # create intermediate directories
                with open(path, 'w') as f:
                    f.write(cookie)
            else:
                with open(path, 'w') as f:
                    f.write(cookie)
        except:
            sys.exit(0)

    # read a cookie string from file
    @staticmethod
    def readCookie(path):
        if not os.path.isfile(path):
            crawlerTool.log = crawlerTool.log + 'cookie not found\n'
            return ''
        else:
            with open(path, 'r') as f:
                return f.read()
        pass
def keywordSearch(maxPageNum, keyword, proxy=''):
    try:
        # print proxy
        # print keyword,'do list search'
        keyword = keyword.replace(' ', '+')
        pageNum = 0
        urlListDepth0 = []
        urlDepth0 = 'https://www.youtube.com/results?search_query=' + keyword
        finalResult = []
        for pageNum in range(maxPageNum):
            pageDepth0 = crawlerTool.getPage(urlDepth0, proxy=proxy)
            # print pageDepth0
            urlDepth1 = re.findall('class="yt-lockup-title\s*"><a href="(/watch\?v=[\w_-]+&amp;list=[^"]+)"', pageDepth0)
            urlDepth0 = 'https://www.youtube.com' + crawlerTool.getRegex('<a href="(.*?)"[^>]+"><span class="yt-uix-button-content">Next', pageDepth0)
            # print urlDepth0
            urlListDepth1 = []
            for url in urlDepth1:
                url = url.replace('&amp;', '&')
                url = 'https://www.youtube.com' + url
                if not url in urlListDepth1:
                    # print url
                    urlListDepth1.append(url)
            # print urlListDepth1,len(urlListDepth1)
            urlListDepth2 = []
            for url in urlListDepth1:
                # print 'open listUrl:',url
                pageDepth1 = crawlerTool.getPage(url, proxy=proxy).replace('&amp;', '&')
                urlDepth2 = re.findall('(/watch\?v=[^"]*)\&index=\d+', pageDepth1)
                for urlDepth2 in urlDepth2:
                    if not urlDepth2 in urlListDepth2:
                        urlDepth2 = 'http://www.youtube.com' + urlDepth2
                        finalResult.append(urlDepth2)
                        # print urlDepth2
                        urlListDepth2.append(urlDepth2)
        # print len(finalResult),finalResult
        return finalResult
    except:
        print 'do listSearch failed'
# takes a keyword and a maximum page number and returns a list of hosting URLs; this script only covers playlist links
def main():
    pass


if __name__ == '__main__':
    ct = crawlerTool()
    data = {
        "keyid": "abcdefghijk2ml2n83",
        "website": "Kuwo",
        "url": "http://www.filebox.com",
        "author": "bb",
        "author_url": "http://www.filebox.com/?v=293280JUN0102",
        "post_date": "2015-03-20 1:12:50",
        "hide_flag2": 0,
        "duration": 225
    }
    print json.dumps(data)
    print ct.getPageByJson('http://192.168.1.72:8080/VTServiceFK/service/updateVideoInfo', data=data)
    sys.exit()
    print ct.getDirectUrl('http://v.qq.com/page/c/b/4/c0361j0fab4.html')
    keywordSearch(1, "simpsons full episode")
This article was originally written by CSDN blogger 「Memory_qq312141830」.
Original link: https://blog.csdn.net/Memory_and_Dream/article/details/72917848