Below is a small Python program that fetches the images on a given URL and saves them locally under D://imgs_from_yinwoods/ (on Linux the corresponding folder is created under the current directory).

#coding=utf-8
import re
import os
import urllib
import urllib.request

def getHtml(url):  # fetch the page source for the given url
    page = urllib.request.urlopen(url)
    html = page.read()
    return html

def getImg(htmls):  # extract and download the images contained in the htmls collection
    print("I'm trying to get sexy images......")
    print("The images I'll get will be placed in dir D://imgs_from_yinwoods")
    reg = r'src="(.+?\.jpg)"'  # regular expression matching image urls
    imgre = re.compile(reg)
    x = 0
    imglist = []
    for cnt in range(0, 4):  # collect the image urls from the 4 page sources
        imglist.append(re.findall(imgre, htmls[cnt].decode('utf-8')))
    path = os.path.join("D:/", "imgs_from_yinwoods")  # destination folder
    os.makedirs(path)  # create the folder
    for imgurl in imglist:
        for url in imgurl:
            print(url)
            url = "http://hiowner.com/" + url
            print(url)
            urllib.request.urlretrieve(url, 'D://imgs_from_yinwoods/%s.jpg' % x)
            x += 1
            print("I've got %d images" % x)
            if x > 99:
                return

html1 = getHtml("http://hiowner.com/users")
html2 = getHtml("http://hiowner.com/users/page/2")
html3 = getHtml("http://hiowner.com/users/page/3")
html4 = getHtml("http://hiowner.com/users/page/4")
htmls = [html1, html2, html3, html4]
getImg(htmls)

Revisiting this code recently, I found it is written quite poorly, and running it now fails with an HTTP 403 error: the site refuses to serve page content to crawlers that identify themselves as such.
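For reference, with urllib the rejection surfaces as an exception rather than a status code you inspect; a minimal probe against the same URL looks like this:

import urllib.request
import urllib.error

try:
    urllib.request.urlopen("http://hiowner.com/users")
except urllib.error.HTTPError as e:
    print(e.code)  # prints 403 when the site blocks the default Python-urllib User-Agent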

All we need to do, then, is put a disguise on the crawler: attach a User-Agent header to the request when fetching the page.
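A minimal sketch of the disguise (any common browser User-Agent string will do; the full version appears in the code below):

import urllib.request

headers = {'User-Agent': 'Mozilla/5.0'}  # pretend to be an ordinary browser
req = urllib.request.Request("http://hiowner.com/users", headers=headers)
html = urllib.request.urlopen(req).read()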

Note also that urllib.request.urlretrieve() can't be used here, because it offers no per-call way to set the User-Agent header, so instead I fetch the raw image bytes myself and write them to a local file.
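The download step therefore becomes: fetch the bytes through a Request carrying the header, then write them out. Roughly (img_url and the file name are placeholders for illustration):

import urllib.request

headers = {'User-Agent': 'Mozilla/5.0'}
img_url = "http://hiowner.com/some.jpg"  # hypothetical image address
req = urllib.request.Request(img_url, headers=headers)
img_bytes = urllib.request.urlopen(req).read()
with open('0.jpg', 'wb') as f:  # write the raw bytes to a local file
    f.write(img_bytes)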

I also added multithreading to speed up the crawl, one thread per listing page.
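A sketch of the mechanism, with crawl_page standing in for the real per-page task; note that threading.Thread expects args as a tuple:

import threading

def crawl_page(pageNum):  # placeholder for the real per-page task
    print("crawling page", pageNum)

threads = [threading.Thread(target=crawl_page, args=(n,)) for n in range(1, 6)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait until every page is done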

The code is as follows:

#coding=utf-8
import os
import time
import threading
import urllib.request
from bs4 import BeautifulSoup

class MM_Crawler:
    def getHtmlContents(self, url):  # fetch the page source for the given url
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/48.0.2564.116 Chrome/48.0.2564.116 Safari/537.36',
        }
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        html = response.read()
        return html

    def getImg(self, html, id):  # download every image linked from the page source
        print("I'm trying to get sexy images......")
        print("The images will be placed in dir page-%d under the current directory" % id)
        html_Doc = BeautifulSoup(html, 'html.parser')
        divLists = html_Doc.find_all("div", "panel-body")
        imgNum = 0
        dirName = "page-" + str(id)
        if os.path.exists(dirName):
            print("Directory already exists!")
        else:
            os.mkdir(dirName)
        for divItem in divLists:
            link = divItem.find('a')
            if link is not None:
                imgLink = "http://hiowner.com/" + link.find('img')['src']
                imgNum += 1
                fileName = dirName + '/%s.jpg' % imgNum
                if os.path.isfile(fileName):  # skip images saved on a previous run
                    print(fileName + " already exists!")
                else:
                    img = self.getHtmlContents(imgLink)
                    with open(fileName, 'wb') as file:
                        file.write(img)

mmcrawler = MM_Crawler()

def tsk1(pageNum):
    url = "http://hiowner.com/users/page/" + str(pageNum)
    mmcrawler.getImg(mmcrawler.getHtmlContents(url).decode('utf8'), pageNum)
    time.sleep(1)

def createMultiThread(threadNum):
    threads = []
    for num in range(threadNum):
        # args must be a tuple; a bare string would be unpacked character by character
        threads.append(threading.Thread(target=tsk1, args=(num + 1,)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for every page to finish

def main():
    createMultiThread(5)

if __name__ == "__main__":
    main()