This is a crawler utility class I wrote myself. It can send GET/POST requests to fetch pages, handle cookies, parse pages with regular expressions and XPath, and run a simple keyword-search crawler.
Apart from lxml, everything it uses ships with a standard Python 2 installation.
If you do not have that library, install it with pip, or delete the `from lxml import etree` import together with the getXpath method:
$ pip install lxml
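For orientation before the full listing, here is a minimal usage sketch. It assumes the listing below is saved as crawlerTool.py (that file name and the example URL are placeholders of mine); the methods are the ones defined in the class.

# minimal usage sketch, assuming the listing below is saved as crawlerTool.py
from crawlerTool import crawlerTool

html = crawlerTool.getPage('http://example.com')                 # plain GET, returns the page body
title = crawlerTool.getRegex(r'<title>(.*?)</title>', html)      # first regex capture group
links = crawlerTool.getXpath('//a/@href', html)                  # XPath extraction via lxml
detail = crawlerTool.getPageDetail('http://example.com')         # dict with 'code', 'cookie', 'pageContent'
print title, len(links), detail.get('code')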
Code:
#! /usr/bin/python
#coding=utf-8
import sys
import urllib
import urllib2
import re
import os
import cookielib
import json
from lxml import etree
class requestPars:
    PROXY = 'proxy'
    USER_AGENT = 'userAgent'
    DATA = 'data'
    COOKIE = 'cookie'
# general-purpose helper methods
class crawlerTool:
    # class-level (static) variable
    log = ''

    def __init__(self):
        pass

    # basic page fetch, returns the raw page content
    # getPage(url,data=xx) getPage(url,requestPars.=xx)
    @staticmethod
    def getPage(url, proxy=None, data=None, referer=None, cookie=None, userAgent=None, cookiePath=None):
        # print url
        crawlerTool.log = crawlerTool.log + url
        page_buf = ''
        i = 0
        for i in range(1):
            # print url
            try:
                if proxy:
                    handlers = [urllib2.ProxyHandler({'http': 'http://%s/' % proxy, 'https': 'http://%s/' % proxy})]
                    opener = urllib2.build_opener(*handlers)
                else:
                    opener = urllib2.build_opener()
                method = urllib2.Request(url, data)
                if referer:
                    method.add_header('Referer', referer)
                if cookiePath:
                    method.add_header('Cookie', crawlerTool.readCookie(cookiePath))
                if cookie:
                    method.add_header('Cookie', cookie)
                if userAgent:
                    method.add_header('User-Agent', userAgent)
                else:
                    method.add_header('User-Agent',
                                      'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36')
                method.add_header('Accept-Language', 'en-US,en;q=0.5')
                result = opener.open(method, timeout=10)
                page_buf = result.read()
                return page_buf
            except urllib2.URLError, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                return str(reason)
            except Exception, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                raise Exception(reason)
        pass
    # getPageByJson: POST a JSON body; data input is a dict
    # getPage(url,data=xx) getPage(url,requestPars.=xx)
    @staticmethod
    def getPageByJson(url, proxy=None, data={}, referer=None, cookie=None, userAgent=None, cookiePath=None):
        # print url
        crawlerTool.log = crawlerTool.log + url
        page_buf = ''
        i = 0
        for i in range(1):
            # print url
            try:
                if proxy:
                    handlers = [urllib2.ProxyHandler({'http': 'http://%s/' % proxy, 'https': 'http://%s/' % proxy})]
                    opener = urllib2.build_opener(*handlers)
                else:
                    opener = urllib2.build_opener()
                if type(data) == type({}): data = json.dumps(data)
                method = urllib2.Request(url, data=data)  # note: Python None maps to JSON null
                method.add_header('Content-Type', 'application/json')
                if referer:
                    method.add_header('Referer', referer)
                if cookiePath:
                    method.add_header('Cookie', crawlerTool.readCookie(cookiePath))
                if cookie:
                    method.add_header('Cookie', cookie)
                if userAgent:
                    method.add_header('User-Agent', userAgent)
                else:
                    method.add_header('User-Agent',
                                      'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36')
                method.add_header('Accept-Language', 'en-US,en;q=0.5')
                result = opener.open(method, timeout=10)
                page_buf = result.read()
                return page_buf
            except urllib2.URLError, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                return str(reason)
            except Exception, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                raise Exception(reason)
        pass
    # return the first capture group of the first regex match
    @staticmethod
    def getRegex(pattern, content):
        group = re.search(pattern, content)
        if group:
            return group.groups()[0]
        else:
            return ''
    # evaluate an XPath expression; input-type checking and error handling are still TODO
    @staticmethod
    def getXpath(xpath, content):
        tree = etree.HTML(content)
        out = []
        results = tree.xpath(xpath)
        for result in results:
            if 'ElementStringResult' in str(type(result)):
                out.append(result)
            else:
                out.append(etree.tostring(result))
        return out
    # get the final URL after redirects
    @staticmethod
    def getDirectUrl(url):
        u = urllib2.urlopen(url)
        redirectUrl = u.geturl()
        return redirectUrl
    # return various details about the page as a dict
    @staticmethod
    def getPageDetail(url, proxy=None, data=None, referer=None, cookie=None, userAgent=None, cookiePath=None):
        PageDetail = {}
        # print url
        crawlerTool.log = crawlerTool.log + url + '\n'
        page_buf = ''
        n = 1
        for i in range(n):
            # print url
            try:
                getCookie = cookielib.CookieJar()
                cookieHandler = urllib2.HTTPCookieProcessor(getCookie)
                if proxy:
                    handlers = [urllib2.ProxyHandler({'http': 'http://%s/' % proxy, 'https': 'http://%s/' % proxy}), cookieHandler]
                    opener = urllib2.build_opener(*handlers)
                else:
                    opener = urllib2.build_opener(cookieHandler)
                method = urllib2.Request(url, data)
                if referer:
                    method.add_header('Referer', referer)
                if cookiePath:
                    method.add_header('Cookie', crawlerTool.readCookie(cookiePath))
                if cookie:
                    method.add_header('Cookie', cookie)
                if userAgent:
                    method.add_header('User-Agent', userAgent)
                else:
                    method.add_header('User-Agent',
                                      'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36')
                method.add_header('Accept-Language', 'en-US,en;q=0.5')
                result = opener.open(method, timeout=10)
                # print str(result.headers)
                page_buf = result.read()
                PageDetail['pageContent'] = page_buf
                PageDetail['code'] = 200
                cookie_str = ''
                for item in getCookie:
                    cookie_str += item.name + "=" + item.value + "; "
                PageDetail['cookie'] = cookie_str
                # print 'getcookie:' + cookie_str
                break
            except urllib2.HTTPError, e:
                # print e.reason
                PageDetail['code'] = e.code
                PageDetail['cookie'] = e.headers.get('Set-Cookie', '')  # Baidu 403 responses still carry a cookie we need
                # print e.headers.get('Set-Cookie','')
            except urllib2.URLError, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                # print reason.read()
                PageDetail['code'] = 1003
                # print 'URLError' + str(reason)
                break
            except Exception, reason:
                if i == n:
                    crawlerTool.log = crawlerTool.log + str(reason)
                    # print 'Error' + str(reason)
                break
        return PageDetail
    # save a cookie string to path, creating the file if it does not exist;
    # if you do not need to write it yourself, cookielib.MozillaCookieJar(filename) works too
    @staticmethod
    def saveCookie(cookie, path):
        if os.path.isdir(path):
            crawlerTool.log = crawlerTool.log + 'path cant be dir\n'
            sys.exit(0)
        try:
            if not os.path.exists(path):
                parent_path = os.path.dirname(path)
                if not os.path.exists(parent_path): os.makedirs(parent_path)  # create intermediate directories
                with open(path, 'w') as f:
                    f.write(cookie)
            else:
                with open(path, 'w') as f:
                    f.write(cookie)
        except:
            sys.exit(0)

    # read a cookie string from file
    @staticmethod
    def readCookie(path):
        if not os.path.isfile(path):
            crawlerTool.log = crawlerTool.log + 'cookie not found\n'
            return ''
        else:
            with open(path, 'r') as f:
                return f.read()
        pass
def keywordSearch(maxPageNum, keyword, proxy=''):
    try:
        # print proxy
        # print keyword,'do list search'
        keyword = keyword.replace(' ', '+')
        pageNum = 0
        urlListDepth0 = []
        urlDepth0 = 'https://www.youtube.com/results?search_query=' + keyword
        finalResult = []
        for pageNum in range(maxPageNum):
            pageDepth0 = crawlerTool.getPage(urlDepth0, proxy=proxy)
            # print pageDepth0
            urlDepth1 = re.findall('class="yt-lockup-title\s*"><a href="(/watch\?v=[\w_-]+&amp;list=[^"]+)"', pageDepth0)
            urlDepth0 = 'https://www.youtube.com' + crawlerTool.getRegex('<a href="(.*?)"[^>]+"><span class="yt-uix-button-content">Next', pageDepth0)
            # print urlDepth0
            urlListDepth1 = []
            for url in urlDepth1:
                url = url.replace('&amp;', '&')
                url = 'https://www.youtube.com' + url
                if not url in urlListDepth1:
                    # print url
                    urlListDepth1.append(url)
            # print urlListDepth1,len(urlListDepth1)
            urlListDepth2 = []
            for url in urlListDepth1:
                # print 'open listUrl:',url
                pageDepth1 = crawlerTool.getPage(url, proxy=proxy).replace('&amp;', '&')
                urlDepth2 = re.findall('(/watch\?v=[^"]*)\&index=\d+', pageDepth1)
                for urlDepth2 in urlDepth2:
                    if not urlDepth2 in urlListDepth2:
                        urlDepth2 = 'http://www.youtube.com' + urlDepth2
                        finalResult.append(urlDepth2)
                        # print urlDepth2
                        urlListDepth2.append(urlDepth2)
        # print len(finalResult),finalResult
        return finalResult
    except:
        print 'do listSearch failed'
# takes a keyword and a maximum page number and returns a list of hosting URLs; this script only covers playlist links
def main():
    pass


if __name__ == '__main__':
    ct = crawlerTool()
    data = {
        "keyid": "abcdefghijk2ml2n83",
        "website": "Kuwo",
        "url": "http://www.filebox.com",
        "author": "bb",
        "author_url": "http://www.filebox.com/?v=293280JUN0102",
        "post_date": "2015-03-20 1:12:50",
        "hide_flag2": 0,
        "duration": 225
    }
    print json.dumps(data)
    print ct.getPageByJson('http://192.168.1.72:8080/VTServiceFK/service/updateVideoInfo', data=data)
    sys.exit()
    print ct.getDirectUrl('http://v.qq.com/page/c/b/4/c0361j0fab4.html')
    keywordSearch(1, "simpsons full episode")
This article was originally written by CSDN blogger 「Memory_qq312141830」.
Original link: https://blog.csdn.net/Memory_and_Dream/article/details/72917848