# Colab image scraper for imgbin.com: crawls the listing pages of a tag,
# collects the image links, and downloads the files to Google Drive using a
# pool of worker threads.
import os
import threading
from queue import Queue

import requests
from bs4 import BeautifulSoup  # pages are parsed with the lxml backend

from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/drive', force_remount=False)
# path, storage, and webpage are module-level names shared by the functions below.
# Example URLs:
#   https://imgbin.com/free-png/order-a-rabbit/
#   https://imgbin.com/free-png/re-zero/
webpage = input("Please enter url: \n>>>")
if not webpage.endswith('/'):
    webpage = webpage + '/'

folder_name = input("Please enter folder name: \n>>>")
path = "/content/drive/MyDrive/photo_downloads/%s" % folder_name
storage = path
os.makedirs(path, exist_ok=True)  # create the download folder on Drive if it does not exist yet
def getpages(webpage):
    """Return the number of listing pages, read from the 'Last' pagination link."""
    response = requests.get(webpage)
    soup = BeautifulSoup(response.text, "lxml")
    for link in soup.find_all('a'):
        if "Last" in link.getText():
            # e.g. href=".../free-png/re-zero/12" -> 12
            return int(link.get("href").split("/")[-1])
    return 1  # no pagination link found: only one page
def getAllLinks(webpage):
    """Collect the detail-page links from every listing page (pagesTotal is set in __main__)."""
    all_links = []
    for page in range(1, pagesTotal + 1):
        url = webpage + str(page)
        links = getlinks(url)
        all_links.extend(links)
        print("Page", page, "has:", len(links))
    return all_links
def getlinks(url):
    """Return the imgbin detail-page URLs found on one listing page."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and "/png/" in href:
            links.append("https://imgbin.com" + href)
    return links
def getRealLinks(url):
    """Return the CDN image URL from a detail page, or None if no image is found."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    for img in soup.find_all('img'):
        src = img.get('src')
        if src and ".jpg" in src and "cdn" in src:
            return src
    return None
def producer(photo):
    """Put every image detail-page link (from getAllLinks) onto the shared queue."""
    for line in photo:
        qq.put(line)  # add each image link to the queue, one by one
        # print("Adding_Lines_To_Pool:", line, "No. of value in Pool:", qq.qsize())
        # print(qq.qsize())
def consumer(args, path):
    """Drain the queue and download each image into the Drive folder."""
    while not qq.empty():
        imageLink = qq.get()  # items come off the queue in insertion order, so writers never collide
        imgurl = getRealLinks(imageLink)
        # print("Threading:", args, 'READING_Value_From_Pool:', imgurl)
        if imgurl is None:
            continue  # this detail page had no downloadable image
        r = requests.get(imgurl, stream=True)
        filename = imgurl.split("/")[-1]
        with open(os.path.join(path, filename), 'wb') as f:
            for chunk in r.iter_content(chunk_size=128):
                f.write(chunk)
    print("threading:", args, "all read~")
if __name__ == "__main__":
    threads = []  # pool of independent download threads
    pagesTotal = getpages(webpage)
    photo = getAllLinks(webpage)
    print("image total", len(photo))

    qq = Queue()  # thread-safe FIFO queue; no extra lock needed, items are read in insertion order

    for i in range(2, 10):
        t = threading.Thread(target=consumer, args=(str(i), path))
        threads.append(t)  # add each download thread to the pool

    Th1 = threading.Thread(target=producer, args=(photo,))
    Th1.start()  # a single thread that fills the queue with image links
    Th1.join()

    for t in threads:  # start every download thread in the pool
        t.start()
    for t in threads:
        t.join()

    f = os.listdir(path)
    print("Photos in folder:", len(f))
    print("success")