如何用Python编写爬虫程序抓取网页内容?

2026-06-09 14:38 · 6阅读 · 0评论 · SEO资源
  • 内容介绍
  • 文章标签
  • 相关推荐

本文共计1915个文字,预计阅读时间需要8分钟。

如何用Python编写爬虫程序抓取网页内容?

“使用Mechanize库浏览网页”

Mechanize库浏览页面

#!/usr/bin/python
#coding=utf-8
import mechanize

def viewPage(url):
    """Open *url* in a new mechanize browser and print the raw response body."""
    response = mechanize.Browser().open(url)
    print(response.read())

viewPage('www.imooc.com/')

使用代理服务器、User-Agent和cookie:

#!/usr/bin/python
#coding=utf-8
import mechanize

def testProxy(url, proxy):
    """Fetch *url* through a proxy and print the response body.

    ``proxy`` is passed unchanged to ``Browser.set_proxies``.
    """
    client = mechanize.Browser()
    client.set_proxies(proxy)
    print(client.open(url).read())

url = '2017.ip138.com/ic.asp'
hideMeProxy = {'whatismyuseragent.dotdoh.com/'
userAgent = [('User-agent', 'Mozilla/5.0 (X11; U; Linux 2.4.2-2 i586; en-US; m18) Gecko/20010131 Netscape6/6.01')]
testUserAgent(url, userAgent)

把代码集成在Python类的AnonBrowser中

#!/usr/bin/python
#coding=utf-8
import mechanize
import cookielib
import random

class anonBrowser(mechanize.Browser):
def __init__(self, proxies=None, user_agents=None):
    """Initialise the browser, its cookie jar and rotation lists, then anonymize.

    proxies     -- optional list of proxy servers available for rotation
    user_agents -- optional list of extra User-Agent strings; four built-in
                   ones are always appended after them
    """
    mechanize.Browser.__init__(self)
    self.set_handle_robots(False)
    # ``None`` replaces the former mutable ``[]`` defaults, which were single
    # list objects shared by every instance created without arguments.
    self.proxies = [] if proxies is None else proxies
    extra_agents = [] if user_agents is None else user_agents
    self.user_agents = extra_agents + [
        'Mozilla/4.0 ', 'FireFox/6.01', 'ExactSearch', 'Nokia7110/1.0']
    self.cookie_jar = cookielib.LWPCookieJar()
    self.set_cookiejar(self.cookie_jar)
    self.anonymize()

# 清空cookie
def clear_cookies(self):
    """Drop all cookies by installing a brand-new, empty LWP cookie jar."""
    fresh_jar = cookielib.LWPCookieJar()
    self.cookie_jar = fresh_jar
    self.set_cookiejar(fresh_jar)

# 从user_agent列表中随机设置一个user_agent
def change_user_agent(self):
    """Pick one entry of ``self.user_agents`` at random and make it the
    only ``User-agent`` request header."""
    pick = random.randrange(0, len(self.user_agents))
    chosen_agent = self.user_agents[pick]
    self.addheaders = [('User-agent', chosen_agent)]

# 从代理列表中随机设置一个代理
def change_proxy(self):
if self.proxies:
index = random.randrange(0, len(self.proxies))
self.set_proxies( {'www.kittenwar.com/')
for cookie in ab.cookie_jar:
print cookie

用BeautifulSoup解析Href链接:

#!/usr/bin/python
#coding=utf-8
from anonBrowser import *
from BeautifulSoup import BeautifulSoup
import os
import optparse
import re

def printLinks(url):
    """Print every link found on *url*, first via a regex, then via BeautifulSoup.

    The page is fetched once through an anonBrowser. A failure in one
    extraction pass is reported and does not stop the other pass.
    """
    ab = anonBrowser()
    ab.anonymize()
    html = ab.open(url).read()

    # Pass 1: regular expression over the raw HTML.
    print('[+] Printing Links From Regex.')
    try:
        for link in re.findall(r'href="(.*?)"', html):
            print(link)
    except Exception as e:
        # This used to be a bare ``except: pass`` that hid every error
        # (even KeyboardInterrupt); report it and carry on instead.
        print('[-] Regex link extraction failed: ' + str(e))

    # Pass 2: BeautifulSoup parse tree, looking at every <a> tag.
    print('\n[+] Printing Links From BeautifulSoup.')
    try:
        for link in BeautifulSoup(html).findAll(name='a'):
            if link.has_key('href'):
                print(link['href'])
    except Exception as e:
        print('[-] BeautifulSoup link extraction failed: ' + str(e))

def main():
    """Parse ``-u <target url>`` from the command line and print that page's links.

    Prints the usage string and exits with status 0 when no URL is given.
    """
    parser = optparse.OptionParser('[*]Usage: python linkParser.py -u <target url>')
    parser.add_option('-u', dest='tgtURL', type='string', help='specify target url')
    (options, args) = parser.parse_args()
    url = options.tgtURL

    # Identity test instead of ``== None``; SystemExit instead of the
    # interactive-shell helper ``exit()``.
    if url is None:
        print(parser.usage)
        raise SystemExit(0)
    printLinks(url)


if __name__ == '__main__':
    main()

用BeautifulSoup镜像(下载)网页中的图像

#!/usr/bin/python
#coding=utf-8
from anonBrowser import *
from BeautifulSoup import BeautifulSoup
import os
import optparse

def mirrorImages(url, dir):
    """Download every ``<img>`` referenced by the page at *url* into *dir*.

    url -- page whose images are mirrored
    dir -- destination directory (the name shadows the builtin but is kept
           so existing callers keep working)

    Each file is named after the image's ``src`` with '/' replaced by '_'.
    """
    ab = anonBrowser()
    ab.anonymize()
    soup = BeautifulSoup(ab.open(url))

    for image in soup.findAll('img'):
        # A tag without src has nothing to fetch; indexing it used to raise
        # KeyError and abort the whole run.
        if not image.has_key('src'):
            continue
        src = image['src']
        # The former ``lstrip('')`` stripped nothing, so it is dropped.
        filename = os.path.join(dir, src.replace('/', '_'))
        print('[+] Saving ' + str(filename))
        data = ab.open(src).read()
        # Go back in the browser history before handling the next image.
        ab.back()
        # ``with`` closes the file even when the write fails.
        with open(filename, 'wb') as save:
            save.write(data)

def main():
    """Parse ``-u`` and ``-d`` from the command line and mirror the page's images.

    Prints the usage string and exits with status 0 when either option is
    missing. Any error raised while mirroring is printed, not propagated.
    """
    parser = optparse.OptionParser('[*]Usage: python imageMirror.py -u <target url> -d <destination directory>')
    parser.add_option('-u', dest='tgtURL', type='string', help='specify target url')
    parser.add_option('-d', dest='dir', type='string', help='specify destination directory')
    (options, args) = parser.parse_args()
    url = options.tgtURL
    dest_dir = options.dir  # local renamed so it no longer shadows builtin dir()

    if url is None or dest_dir is None:
        print(parser.usage)
        raise SystemExit(0)
    try:
        mirrorImages(url, dest_dir)
    except Exception as e:  # ``as`` form is valid on Python 2.6+ and 3
        print('[-] Error Mirroring Images.')
        print('[-] ' + str(e))


if __name__ == '__main__':
    main()

用Python与谷歌API交互

#!/usr/bin/python
#coding=utf-8
import urllib
from anonBrowser import *

def google(search_term):
    """Query the Google Custom Search JSON API for *search_term* and print the raw reply.

    The ``key`` and ``cx`` placeholders in the URL must be replaced with real
    credentials before use.
    """
    ab = anonBrowser()
    # URL-encode the search term before appending it to the query string.
    search_term = urllib.quote_plus(search_term)
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = ab.open('https://www.googleapis.com/customsearch/v1?key=你的key&cx=你的id&num=1&alt=json&q=' + search_term)
    print(response.read())

google('Boundock Saint')

接着就对Json格式的数据进行处理,添加json库的load()函数对Json数据进行加载即可

#!/usr/bin/python
#coding=utf-8
import urllib
from anonBrowser import *
import json

def google(search_term):
    """Query the Google Custom Search JSON API for *search_term* and print the decoded JSON.

    The ``key`` and ``cx`` placeholders in the URL must be replaced with real
    credentials before use.
    """
    ab = anonBrowser()
    # URL-encode the search term before appending it to the query string.
    search_term = urllib.quote_plus(search_term)
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = ab.open('https://www.googleapis.com/customsearch/v1?key=你的key&cx=你的id&num=1&alt=json&q=' + search_term)
    objects = json.load(response)
    print(objects)

google('Boundock Saint')

编写Google_Result类,用于保存从Json数据中解析出的标题、摘要和链接

#!/usr/bin/python
#coding=utf-8
import urllib
from anonBrowser import *
import json
import optparse

class Google_Result:
    """One search hit: its title, snippet text and URL."""

    def __init__(self, title, text, url):
        self.title, self.text, self.url = title, text, url

    def __repr__(self):
        # A result is represented by its bare title.
        return self.title

def google(search_term):
    """Search Google for *search_term* and return the hits as Google_Result objects.

    Each hit's link, title and snippet are also printed. Returns an empty
    list when the response carries no ``items`` entry.
    """
    ab = anonBrowser()
    # URL-encode the search term before appending it to the query string.
    search_term = urllib.quote_plus(search_term)
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = ab.open('https://www.googleapis.com/customsearch/v1?key=你的key&cx=你的id&num=1&alt=json&q=' + search_term)
    objects = json.load(response)
    results = []

    # Indexing ``objects['items']`` raised KeyError when the key was absent.
    for result in objects.get('items', []):
        url = result['link']
        title = result['title']
        text = result['snippet']
        print(url)
        print(title)
        print(text)
        results.append(Google_Result(title, text, url))
    return results

def main():
    """Parse ``-k <keywords>`` from the command line, search, and print the results.

    Prints the usage string and exits with status 0 when no keyword is given.
    """
    parser = optparse.OptionParser('[*]Usage: python anonGoogle.py -k <keywords>')
    parser.add_option('-k', dest='keyword', type='string', help='specify google keyword')
    (options, args) = parser.parse_args()
    keyword = options.keyword

    # Identity test instead of ``== None``; SystemExit instead of the
    # interactive-shell helper ``exit()``.
    if keyword is None:
        print(parser.usage)
        raise SystemExit(0)
    results = google(keyword)
    print(results)


if __name__ == '__main__':
    main()

用Python解析Tweets个人主页

#!/usr/bin/python
#coding=utf-8
import json
import urllib
from anonBrowser import *

class reconPerson:
    """A person under reconnaissance: name, job and known social-media entries."""

    def __init__(self, first_name, last_name, job='', social_media=None):
        self.first_name = first_name
        self.last_name = last_name
        self.job = job
        # A ``{}`` default would be one dict shared by every instance, so an
        # entry added for one person would show up on all the others.
        self.social_media = {} if social_media is None else social_media

    def __repr__(self):
        return self.first_name + ' ' + self.last_name + ' has job ' + self.job

    def get_social(self, media_name):
        """Return the entry stored under *media_name*, or None if there is none."""
        return self.social_media.get(media_name)

    def query_twitter(self, query):
        """Send *query* to the Twitter search endpoint through an anonBrowser.

        Returns a list of dicts with the keys 'from_user', 'geo' and 'tweet'.
        """
        query = urllib.quote_plus(query)
        browser = anonBrowser()
        # An explicit scheme is required; a bare host name is not an openable URL.
        response = browser.open('http://search.twitter.com/search.json?q=' + query)
        json_objects = json.load(response)
        results = []
        for result in json_objects['results']:
            results.append({
                'from_user': result['from_user_name'],
                'geo': result['geo'],
                'tweet': result['text'],
            })
        return results

ap = reconPerson('Boondock', 'Saint')
print ap.query_twitter('from:th3j35t3r since:2010-01-01 include:retweets')

从推文中提取地理位置信息

#!/usr/bin/python
#coding=utf-8
import json
import urllib
import optparse
from anonBrowser import *

def get_tweets(handle):
    """Search Twitter with ``from:<handle> since:2009-01-01 include:retweets``.

    Returns a list of dicts with the keys 'from_user', 'geo' and 'tweet',
    one per entry of the response's 'results' list.
    """
    query = urllib.quote_plus('from:' + handle + ' since:2009-01-01 include:retweets')
    browser = anonBrowser()
    browser.anonymize()
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = browser.open('http://search.twitter.com/search.json?q=' + query)
    json_objects = json.load(response)
    tweets = []
    for result in json_objects['results']:
        tweets.append({
            'from_user': result['from_user_name'],
            'geo': result['geo'],
            'tweet': result['text'],
        })
    return tweets

def load_cities(cityFile):
    """Read one city name per line from *cityFile* and return them lower-cased.

    Blank lines are skipped.
    """
    cities = []
    # ``with`` closes the file; it used to be left open.
    with open(cityFile) as handle:
        for line in handle:
            city = line.strip('\n').strip('\r').lower()
            # An empty name is a substring of every text, so a blank line
            # would be reported as a found location by any substring search.
            if city:
                cities.append(city)
    return cities

def twitter_locate(tweets, cities):
    """Collect location hints for a list of tweets.

    tweets -- dicts with a 'geo' entry (may be None) and a 'tweet' text
    cities -- city names to look for in the lower-cased tweet texts

    Returns the non-None geo values in tweet order, followed by every city
    that occurs in some tweet text. Both counts are also printed.
    """
    locations = [tweet['geo'] for tweet in tweets if tweet['geo'] is not None]
    locCnt = len(locations)

    # Join with a newline so a name cannot match across the boundary of two
    # adjacent tweets (plain concatenation allowed that); a single join also
    # avoids quadratic ``+=`` string building.
    tweetsText = '\n'.join(tweet['tweet'].lower() for tweet in tweets)

    found = [city for city in cities if city in tweetsText]
    cityCnt = len(found)
    locations.extend(found)

    print("[+] Found " + str(locCnt) + " locations via Twitter API and " + str(cityCnt) + " locations from text search.")
    return locations

def main():
    """Parse ``-u <handle>`` and optional ``-c <city file>``, then print found locations.

    Prints the usage string and exits with status 0 when no handle is given.
    """
    parser = optparse.OptionParser('[*]Usage: python twitterGeo.py -u <twitter handle> [-c <list of cities>]')
    parser.add_option('-u', dest='handle', type='string', help='specify twitter handle')
    parser.add_option('-c', dest='cityFile', type='string', help='specify file containing cities to search')
    (options, args) = parser.parse_args()
    handle = options.handle
    cityFile = options.cityFile

    # Identity tests instead of ``== None``; SystemExit instead of the
    # interactive-shell helper ``exit()``.
    if handle is None:
        print(parser.usage)
        raise SystemExit(0)
    cities = []
    if cityFile is not None:
        cities = load_cities(cityFile)
    tweets = get_tweets(handle)
    locations = twitter_locate(tweets, cities)
    print("[+] Locations: " + str(locations))


if __name__ == '__main__':
    main()

用正则表达式解析Twitter用户的兴趣爱好

#!/usr/bin/python
#coding=utf-8
import json
import re
import urllib
import urllib2
import optparse
from anonBrowser import *

def get_tweets(handle):
    """Search Twitter with ``from:<handle> since:2009-01-01 include:retweets``.

    Returns a list of dicts with the keys 'from_user', 'geo' and 'tweet',
    one per entry of the response's 'results' list.
    """
    query = urllib.quote_plus('from:' + handle + ' since:2009-01-01 include:retweets')
    browser = anonBrowser()
    browser.anonymize()
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = browser.open('http://search.twitter.com/search.json?q=' + query)
    json_objects = json.load(response)
    tweets = []
    for result in json_objects['results']:
        tweets.append({
            'from_user': result['from_user_name'],
            'geo': result['geo'],
            'tweet': result['text'],
        })
    return tweets

def find_interests(tweets):
interests = {}
interests['links'] = []
interests['users'] = []
interests['hashtags'] = []

for tweet in tweets:
text = tweet['tweet']
links = re.compile('(search.twitter.com/search.json?q=' + query)
json_objects = json.load(response)
for result in json_objects['results']:
new_result = {}
new_result['from_user'] = result['from_user_name']
new_result['geo'] = result['geo']
new_result['tweet'] = result['text']
tweets.append(new_result)
return tweets

def find_interests(self):
interests = {}
interests['links'] = []
interests['users'] = []
interests['hashtags'] = []

for tweet in self.tweets:
text = tweet['tweet']
links = re.compile('(evil.tgt/malware"
print "[+] Sending Msg: " + spamMsg

sendMail(user, pwd, tgt, 'Re: Important', spamMsg)

if __name__ == '__main__':
main()

版权声明:本博客文章与代码均为学习时整理的笔记,文章 [均为原创] 作品,转载请 [添加出处] ,您添加出处是我创作的动力!

如何用Python编写爬虫程序抓取网页内容?





本文共计1915个文字,预计阅读时间需要8分钟。

如何用Python编写爬虫程序抓取网页内容?

“使用Mechanize库浏览网页”

Mechanize库浏览页面

#!/usr/bin/python
#coding=utf-8
import mechanize

def viewPage(url):
    """Open *url* in a new mechanize browser and print the raw response body."""
    response = mechanize.Browser().open(url)
    print(response.read())

viewPage('www.imooc.com/')

使用代理服务器、User-Agent和cookie:

#!/usr/bin/python
#coding=utf-8
import mechanize

def testProxy(url, proxy):
    """Fetch *url* through a proxy and print the response body.

    ``proxy`` is passed unchanged to ``Browser.set_proxies``.
    """
    client = mechanize.Browser()
    client.set_proxies(proxy)
    print(client.open(url).read())

url = '2017.ip138.com/ic.asp'
hideMeProxy = {'whatismyuseragent.dotdoh.com/'
userAgent = [('User-agent', 'Mozilla/5.0 (X11; U; Linux 2.4.2-2 i586; en-US; m18) Gecko/20010131 Netscape6/6.01')]
testUserAgent(url, userAgent)

把代码集成在Python类的AnonBrowser中

#!/usr/bin/python
#coding=utf-8
import mechanize
import cookielib
import random

class anonBrowser(mechanize.Browser):
def __init__(self, proxies=None, user_agents=None):
    """Initialise the browser, its cookie jar and rotation lists, then anonymize.

    proxies     -- optional list of proxy servers available for rotation
    user_agents -- optional list of extra User-Agent strings; four built-in
                   ones are always appended after them
    """
    mechanize.Browser.__init__(self)
    self.set_handle_robots(False)
    # ``None`` replaces the former mutable ``[]`` defaults, which were single
    # list objects shared by every instance created without arguments.
    self.proxies = [] if proxies is None else proxies
    extra_agents = [] if user_agents is None else user_agents
    self.user_agents = extra_agents + [
        'Mozilla/4.0 ', 'FireFox/6.01', 'ExactSearch', 'Nokia7110/1.0']
    self.cookie_jar = cookielib.LWPCookieJar()
    self.set_cookiejar(self.cookie_jar)
    self.anonymize()

# 清空cookie
def clear_cookies(self):
    """Drop all cookies by installing a brand-new, empty LWP cookie jar."""
    fresh_jar = cookielib.LWPCookieJar()
    self.cookie_jar = fresh_jar
    self.set_cookiejar(fresh_jar)

# 从user_agent列表中随机设置一个user_agent
def change_user_agent(self):
    """Pick one entry of ``self.user_agents`` at random and make it the
    only ``User-agent`` request header."""
    pick = random.randrange(0, len(self.user_agents))
    chosen_agent = self.user_agents[pick]
    self.addheaders = [('User-agent', chosen_agent)]

# 从代理列表中随机设置一个代理
def change_proxy(self):
if self.proxies:
index = random.randrange(0, len(self.proxies))
self.set_proxies( {'www.kittenwar.com/')
for cookie in ab.cookie_jar:
print cookie

用BeautifulSoup解析Href链接:

#!/usr/bin/python
#coding=utf-8
from anonBrowser import *
from BeautifulSoup import BeautifulSoup
import os
import optparse
import re

def printLinks(url):
    """Print every link found on *url*, first via a regex, then via BeautifulSoup.

    The page is fetched once through an anonBrowser. A failure in one
    extraction pass is reported and does not stop the other pass.
    """
    ab = anonBrowser()
    ab.anonymize()
    html = ab.open(url).read()

    # Pass 1: regular expression over the raw HTML.
    print('[+] Printing Links From Regex.')
    try:
        for link in re.findall(r'href="(.*?)"', html):
            print(link)
    except Exception as e:
        # This used to be a bare ``except: pass`` that hid every error
        # (even KeyboardInterrupt); report it and carry on instead.
        print('[-] Regex link extraction failed: ' + str(e))

    # Pass 2: BeautifulSoup parse tree, looking at every <a> tag.
    print('\n[+] Printing Links From BeautifulSoup.')
    try:
        for link in BeautifulSoup(html).findAll(name='a'):
            if link.has_key('href'):
                print(link['href'])
    except Exception as e:
        print('[-] BeautifulSoup link extraction failed: ' + str(e))

def main():
    """Parse ``-u <target url>`` from the command line and print that page's links.

    Prints the usage string and exits with status 0 when no URL is given.
    """
    parser = optparse.OptionParser('[*]Usage: python linkParser.py -u <target url>')
    parser.add_option('-u', dest='tgtURL', type='string', help='specify target url')
    (options, args) = parser.parse_args()
    url = options.tgtURL

    # Identity test instead of ``== None``; SystemExit instead of the
    # interactive-shell helper ``exit()``.
    if url is None:
        print(parser.usage)
        raise SystemExit(0)
    printLinks(url)


if __name__ == '__main__':
    main()

用BeautifulSoup镜像(下载)网页中的图像

#!/usr/bin/python
#coding=utf-8
from anonBrowser import *
from BeautifulSoup import BeautifulSoup
import os
import optparse

def mirrorImages(url, dir):
    """Download every ``<img>`` referenced by the page at *url* into *dir*.

    url -- page whose images are mirrored
    dir -- destination directory (the name shadows the builtin but is kept
           so existing callers keep working)

    Each file is named after the image's ``src`` with '/' replaced by '_'.
    """
    ab = anonBrowser()
    ab.anonymize()
    soup = BeautifulSoup(ab.open(url))

    for image in soup.findAll('img'):
        # A tag without src has nothing to fetch; indexing it used to raise
        # KeyError and abort the whole run.
        if not image.has_key('src'):
            continue
        src = image['src']
        # The former ``lstrip('')`` stripped nothing, so it is dropped.
        filename = os.path.join(dir, src.replace('/', '_'))
        print('[+] Saving ' + str(filename))
        data = ab.open(src).read()
        # Go back in the browser history before handling the next image.
        ab.back()
        # ``with`` closes the file even when the write fails.
        with open(filename, 'wb') as save:
            save.write(data)

def main():
    """Parse ``-u`` and ``-d`` from the command line and mirror the page's images.

    Prints the usage string and exits with status 0 when either option is
    missing. Any error raised while mirroring is printed, not propagated.
    """
    parser = optparse.OptionParser('[*]Usage: python imageMirror.py -u <target url> -d <destination directory>')
    parser.add_option('-u', dest='tgtURL', type='string', help='specify target url')
    parser.add_option('-d', dest='dir', type='string', help='specify destination directory')
    (options, args) = parser.parse_args()
    url = options.tgtURL
    dest_dir = options.dir  # local renamed so it no longer shadows builtin dir()

    if url is None or dest_dir is None:
        print(parser.usage)
        raise SystemExit(0)
    try:
        mirrorImages(url, dest_dir)
    except Exception as e:  # ``as`` form is valid on Python 2.6+ and 3
        print('[-] Error Mirroring Images.')
        print('[-] ' + str(e))


if __name__ == '__main__':
    main()

用Python与谷歌API交互

#!/usr/bin/python
#coding=utf-8
import urllib
from anonBrowser import *

def google(search_term):
    """Query the Google Custom Search JSON API for *search_term* and print the raw reply.

    The ``key`` and ``cx`` placeholders in the URL must be replaced with real
    credentials before use.
    """
    ab = anonBrowser()
    # URL-encode the search term before appending it to the query string.
    search_term = urllib.quote_plus(search_term)
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = ab.open('https://www.googleapis.com/customsearch/v1?key=你的key&cx=你的id&num=1&alt=json&q=' + search_term)
    print(response.read())

google('Boundock Saint')

接着就对Json格式的数据进行处理,添加json库的load()函数对Json数据进行加载即可

#!/usr/bin/python
#coding=utf-8
import urllib
from anonBrowser import *
import json

def google(search_term):
    """Query the Google Custom Search JSON API for *search_term* and print the decoded JSON.

    The ``key`` and ``cx`` placeholders in the URL must be replaced with real
    credentials before use.
    """
    ab = anonBrowser()
    # URL-encode the search term before appending it to the query string.
    search_term = urllib.quote_plus(search_term)
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = ab.open('https://www.googleapis.com/customsearch/v1?key=你的key&cx=你的id&num=1&alt=json&q=' + search_term)
    objects = json.load(response)
    print(objects)

google('Boundock Saint')

编写Google_Result类,用于保存从Json数据中解析出的标题、摘要和链接

#!/usr/bin/python
#coding=utf-8
import urllib
from anonBrowser import *
import json
import optparse

class Google_Result:
    """One search hit: its title, snippet text and URL."""

    def __init__(self, title, text, url):
        self.title, self.text, self.url = title, text, url

    def __repr__(self):
        # A result is represented by its bare title.
        return self.title

def google(search_term):
    """Search Google for *search_term* and return the hits as Google_Result objects.

    Each hit's link, title and snippet are also printed. Returns an empty
    list when the response carries no ``items`` entry.
    """
    ab = anonBrowser()
    # URL-encode the search term before appending it to the query string.
    search_term = urllib.quote_plus(search_term)
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = ab.open('https://www.googleapis.com/customsearch/v1?key=你的key&cx=你的id&num=1&alt=json&q=' + search_term)
    objects = json.load(response)
    results = []

    # Indexing ``objects['items']`` raised KeyError when the key was absent.
    for result in objects.get('items', []):
        url = result['link']
        title = result['title']
        text = result['snippet']
        print(url)
        print(title)
        print(text)
        results.append(Google_Result(title, text, url))
    return results

def main():
    """Parse ``-k <keywords>`` from the command line, search, and print the results.

    Prints the usage string and exits with status 0 when no keyword is given.
    """
    parser = optparse.OptionParser('[*]Usage: python anonGoogle.py -k <keywords>')
    parser.add_option('-k', dest='keyword', type='string', help='specify google keyword')
    (options, args) = parser.parse_args()
    keyword = options.keyword

    # Identity test instead of ``== None``; SystemExit instead of the
    # interactive-shell helper ``exit()``.
    if keyword is None:
        print(parser.usage)
        raise SystemExit(0)
    results = google(keyword)
    print(results)


if __name__ == '__main__':
    main()

用Python解析Tweets个人主页

#!/usr/bin/python
#coding=utf-8
import json
import urllib
from anonBrowser import *

class reconPerson:
    """A person under reconnaissance: name, job and known social-media entries."""

    def __init__(self, first_name, last_name, job='', social_media=None):
        self.first_name = first_name
        self.last_name = last_name
        self.job = job
        # A ``{}`` default would be one dict shared by every instance, so an
        # entry added for one person would show up on all the others.
        self.social_media = {} if social_media is None else social_media

    def __repr__(self):
        return self.first_name + ' ' + self.last_name + ' has job ' + self.job

    def get_social(self, media_name):
        """Return the entry stored under *media_name*, or None if there is none."""
        return self.social_media.get(media_name)

    def query_twitter(self, query):
        """Send *query* to the Twitter search endpoint through an anonBrowser.

        Returns a list of dicts with the keys 'from_user', 'geo' and 'tweet'.
        """
        query = urllib.quote_plus(query)
        browser = anonBrowser()
        # An explicit scheme is required; a bare host name is not an openable URL.
        response = browser.open('http://search.twitter.com/search.json?q=' + query)
        json_objects = json.load(response)
        results = []
        for result in json_objects['results']:
            results.append({
                'from_user': result['from_user_name'],
                'geo': result['geo'],
                'tweet': result['text'],
            })
        return results

ap = reconPerson('Boondock', 'Saint')
print ap.query_twitter('from:th3j35t3r since:2010-01-01 include:retweets')

从推文中提取地理位置信息

#!/usr/bin/python
#coding=utf-8
import json
import urllib
import optparse
from anonBrowser import *

def get_tweets(handle):
    """Search Twitter with ``from:<handle> since:2009-01-01 include:retweets``.

    Returns a list of dicts with the keys 'from_user', 'geo' and 'tweet',
    one per entry of the response's 'results' list.
    """
    query = urllib.quote_plus('from:' + handle + ' since:2009-01-01 include:retweets')
    browser = anonBrowser()
    browser.anonymize()
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = browser.open('http://search.twitter.com/search.json?q=' + query)
    json_objects = json.load(response)
    tweets = []
    for result in json_objects['results']:
        tweets.append({
            'from_user': result['from_user_name'],
            'geo': result['geo'],
            'tweet': result['text'],
        })
    return tweets

def load_cities(cityFile):
    """Read one city name per line from *cityFile* and return them lower-cased.

    Blank lines are skipped.
    """
    cities = []
    # ``with`` closes the file; it used to be left open.
    with open(cityFile) as handle:
        for line in handle:
            city = line.strip('\n').strip('\r').lower()
            # An empty name is a substring of every text, so a blank line
            # would be reported as a found location by any substring search.
            if city:
                cities.append(city)
    return cities

def twitter_locate(tweets, cities):
    """Collect location hints for a list of tweets.

    tweets -- dicts with a 'geo' entry (may be None) and a 'tweet' text
    cities -- city names to look for in the lower-cased tweet texts

    Returns the non-None geo values in tweet order, followed by every city
    that occurs in some tweet text. Both counts are also printed.
    """
    locations = [tweet['geo'] for tweet in tweets if tweet['geo'] is not None]
    locCnt = len(locations)

    # Join with a newline so a name cannot match across the boundary of two
    # adjacent tweets (plain concatenation allowed that); a single join also
    # avoids quadratic ``+=`` string building.
    tweetsText = '\n'.join(tweet['tweet'].lower() for tweet in tweets)

    found = [city for city in cities if city in tweetsText]
    cityCnt = len(found)
    locations.extend(found)

    print("[+] Found " + str(locCnt) + " locations via Twitter API and " + str(cityCnt) + " locations from text search.")
    return locations

def main():
    """Parse ``-u <handle>`` and optional ``-c <city file>``, then print found locations.

    Prints the usage string and exits with status 0 when no handle is given.
    """
    parser = optparse.OptionParser('[*]Usage: python twitterGeo.py -u <twitter handle> [-c <list of cities>]')
    parser.add_option('-u', dest='handle', type='string', help='specify twitter handle')
    parser.add_option('-c', dest='cityFile', type='string', help='specify file containing cities to search')
    (options, args) = parser.parse_args()
    handle = options.handle
    cityFile = options.cityFile

    # Identity tests instead of ``== None``; SystemExit instead of the
    # interactive-shell helper ``exit()``.
    if handle is None:
        print(parser.usage)
        raise SystemExit(0)
    cities = []
    if cityFile is not None:
        cities = load_cities(cityFile)
    tweets = get_tweets(handle)
    locations = twitter_locate(tweets, cities)
    print("[+] Locations: " + str(locations))


if __name__ == '__main__':
    main()

用正则表达式解析Twitter用户的兴趣爱好

#!/usr/bin/python
#coding=utf-8
import json
import re
import urllib
import urllib2
import optparse
from anonBrowser import *

def get_tweets(handle):
    """Search Twitter with ``from:<handle> since:2009-01-01 include:retweets``.

    Returns a list of dicts with the keys 'from_user', 'geo' and 'tweet',
    one per entry of the response's 'results' list.
    """
    query = urllib.quote_plus('from:' + handle + ' since:2009-01-01 include:retweets')
    browser = anonBrowser()
    browser.anonymize()
    # An explicit scheme is required; a bare host name is not an openable URL.
    response = browser.open('http://search.twitter.com/search.json?q=' + query)
    json_objects = json.load(response)
    tweets = []
    for result in json_objects['results']:
        tweets.append({
            'from_user': result['from_user_name'],
            'geo': result['geo'],
            'tweet': result['text'],
        })
    return tweets

def find_interests(tweets):
interests = {}
interests['links'] = []
interests['users'] = []
interests['hashtags'] = []

for tweet in tweets:
text = tweet['tweet']
links = re.compile('(search.twitter.com/search.json?q=' + query)
json_objects = json.load(response)
for result in json_objects['results']:
new_result = {}
new_result['from_user'] = result['from_user_name']
new_result['geo'] = result['geo']
new_result['tweet'] = result['text']
tweets.append(new_result)
return tweets

def find_interests(self):
interests = {}
interests['links'] = []
interests['users'] = []
interests['hashtags'] = []

for tweet in self.tweets:
text = tweet['tweet']
links = re.compile('(evil.tgt/malware"
print "[+] Sending Msg: " + spamMsg

sendMail(user, pwd, tgt, 'Re: Important', spamMsg)

if __name__ == '__main__':
main()

版权声明:本博客文章与代码均为学习时整理的笔记,文章 [均为原创] 作品,转载请 [添加出处] ,您添加出处是我创作的动力!

如何用Python编写爬虫程序抓取网页内容?