import urllib
import threading
from bs4 import BeautifulSoup
import requests
import os
import time
import lxml
# Seed list of Google image-search result pages to crawl
# (start=20 is the second results page; GetUrls appends further pages).
page_links_list=['https://www.google.com.hk/search?q=emoji&hl=zh-HK&gbv=2&biw=1263&bih=625&tbm=isch&ei=sLncXv3pE5fbhwOk77eAAw&start=20&sa=N']
# Image URLs discovered by Producer threads and downloaded by Consumer threads.
img_links_list = []
# Ask how many pages to crawl and build the result-page URL list.
def GetUrls(page_links_list):
    """Prompt for the number of pages to crawl and append the search
    URL for each additional page to *page_links_list* in place.

    The seed list already holds the start=20 page, so extra pages
    (start=40, 60, ...) are only appended when the requested count
    exceeds 20.  Returns None; the list is mutated in place.
    """
    pages = int(input('請輸入你想爬取的頁數:20的倍数'))
    if pages > 20:
        base = ('https://www.google.com.hk/search?q=emoji&hl=zh-HK&gbv=2'
                '&biw=1263&bih=625&tbm=isch&ei=sLncXv3pE5fbhwOk77eAAw')
        # BUG FIX: the original ignored `pages` and always generated the
        # fixed offsets 40..180; honour the requested count instead by
        # treating `pages` as the last start offset to fetch.
        for start in range(40, pages + 20, 20):
            page_links_list.append(base + '&start=' + str(start) + '&sa=N')
    # pages <= 20: the seed page alone is enough (original had a no-op
    # self-assignment here).
# Single global lock protecting both shared lists (page_links_list
# and img_links_list) across producer/consumer threads.
gLock=threading.Lock()
# Producer: pulls image links out of each search-result page.
class Producer(threading.Thread):
    """Worker thread that pops page URLs from ``page_links_list``,
    fetches each page, and appends every ``<img src>`` found in the
    result containers to the shared ``img_links_list``."""

    def run(self):
        while True:
            # BUG FIX: the original tested len(page_links_list) OUTSIDE
            # the lock and then popped inside it — two threads could both
            # pass the test with one element left and one would pop from
            # an empty list.  Check and pop under the same lock hold.
            gLock.acquire()
            if not page_links_list:
                gLock.release()
                break
            page_url = page_links_list.pop()
            gLock.release()

            # Fetch the result page and parse out the image containers.
            html = requests.get(page_url).text
            soup = BeautifulSoup(html, 'lxml')
            containers = soup.find_all('div', {"class": "RAyV4b"})

            # Publish the discovered links under the lock.
            gLock.acquire()
            for container in containers:
                for img in container.find_all('img'):
                    img_links_list.append(img['src'])
            gLock.release()
# Consumer: downloads the images whose URLs the producers collected.
class Consumer(threading.Thread):
    """Worker thread that pops image URLs from ``img_links_list`` and
    streams each image into the ./img directory.  Exits (via exit())
    once a download finishes and the queue is empty."""

    def run(self):
        print("%s is running" % threading.current_thread())
        while True:
            gLock.acquire()
            if len(img_links_list) == 0:
                # Always release the lock, whatever branch we take.
                gLock.release()
                # Yield briefly instead of hot-spinning on the lock while
                # waiting for producers to publish links (the original
                # busy-waited at full speed here).
                # NOTE(review): if producers never find any links these
                # threads wait forever — same as the original behaviour.
                time.sleep(0.05)
                continue
            img_url = img_links_list.pop()
            gLock.release()

            # Last 8 URL characters as a crude unique-ish file name.
            filename = img_url[-8:] + ".jpg"
            print('正在下載:', filename)
            # `with` closes the streamed connection even if a write fails;
            # the original never closed the response object.
            with requests.get(img_url, stream=True) as r:
                with open('./img/%s' % filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=128):
                        f.write(chunk)

            # Queue drained after this download: report elapsed time
            # (``start`` is set in __main__) and stop this thread.
            if len(img_links_list) == 0:
                end = time.time()
                print("消耗的時間為:", (end - start))
                exit()
if __name__ == '__main__':
    # Build the list of result pages to crawl (prompts the user).
    GetUrls(page_links_list)

    # Create the download directory; exist_ok avoids the original
    # exists()/makedirs() check-then-act race (and the `is True` idiom).
    os.makedirs('./img', exist_ok=True)

    start = time.time()
    # 20 producer threads scrape image links from the result pages.
    for _ in range(20):
        Producer().start()
    # 30 consumer threads download the images those links point to.
    for _ in range(30):
        Consumer().start()