17
2021
08

Python: download photos with multiple threads

import os
from google.colab import drive
from google.colab import files
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
import requests
import lxml
from bs4 import BeautifulSoup
import threading
import time
from queue import Queue
import re
 
# Mount Google Drive so downloaded photos persist outside the Colab VM.
drive.mount('/content/drive', force_remount=False)

# Example listing URLs:
#webpage= 'https://imgbin.com/free-png/order-a-rabbit/'
#webpage= 'https://imgbin.com/free-png/re-zero/'

webpage = input("Please enter url: \n>>>")

# Pagination works by appending the page number, so the base URL must end in '/'.
if webpage[-1] != '/':
   webpage = webpage + '/'

folder_name = input("Please enter folder name: \n>>>")

# Destination folder inside the mounted Drive; `storage` is the alias the
# consumer threads write into.
path = "/content/drive/MyDrive/photo_downloads/%s" % folder_name
storage = path

# Create the target folder if it does not exist yet.
# (Replaces the bare try/except around os.listdir, which silently swallowed
# every error and raced with concurrent creation.)
os.makedirs(path, exist_ok=True)

def getpages(webpage):
    """Return the total number of listing pages for *webpage*.

    Looks for the pagination anchor whose text contains "Last" and parses the
    page number from the tail of its href.  Returns 1 when no such anchor is
    found (single-page listing).

    BUG FIX: the original `except UnboundLocalError` could never fire — when
    no "Last" link existed the loop simply finished and the function returned
    None, which later broke the `while page <= pagesTotal` comparison.
    """
    response = requests.get(webpage)
    soup = BeautifulSoup(response.text, "lxml")

    for anchor in soup.find_all('a'):
        if "Last" in anchor.getText():
            href = anchor.get("href")
            if href:  # anchors may lack an href attribute
                return int(href.split("/")[-1])

    return 1
 
def getAllLinks(webpage, total_pages=None):
    """Collect image-detail links from every paginated listing page.

    Parameters
    ----------
    webpage : str
        Base listing URL ending in '/' (the page number is appended).
    total_pages : int, optional
        Number of pages to walk.  Defaults to the module-level ``pagesTotal``
        set in ``__main__`` — kept for backward compatibility, but callers can
        now pass the count explicitly instead of relying on a global.

    Returns a flat list of detail-page URLs.
    """
    if total_pages is None:
        total_pages = pagesTotal  # legacy global, set in __main__

    all_links = []
    for page in range(1, total_pages + 1):
        links = getlinks(webpage + str(page))
        # extend instead of append-then-flatten: same result, one pass
        all_links.extend(links)
        print("Page", page, "have: ", len(links))

    return all_links
 
def getlinks(url):
    """Return absolute imgbin detail-page URLs found on one listing page.

    Scans every ``<a>`` tag and keeps hrefs containing "/png/", prefixing
    them with the site origin.

    BUG FIX: anchors without an href attribute make ``.get('href')`` return
    None, and the original ``"/png/" in None`` raised TypeError — guard first.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")

    links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href and "/png/" in href:
            links.append("https://imgbin.com" + href)

    return links
 
def getRealLinks(url):
    """Return the first CDN-hosted .jpg image URL on a detail page, or None.

    BUG FIX: the original condition was ``".jpg" and "cdn" in src`` — the
    string literal ".jpg" is always truthy, so only the "cdn" test actually
    ran.  Both substrings are now checked, and a None src (img tag without a
    src attribute) no longer raises TypeError.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")

    for img in soup.find_all('img'):
        src = img.get('src')
        if src and ".jpg" in src and "cdn" in src:
            return src

    return None  # explicit: no matching image on this page
 
def producer(photo):
    """Feed every image-page link into the shared work queue ``qq``.

    Runs to completion before any consumer starts (see __main__), so the
    queue is fully populated up front.
    """
    for link in photo:
        # FIFO queue: consumers will drain these in insertion order
        qq.put(link)
 
def consumer(args, path):
    """Worker thread: drain image-page links from ``qq`` and save each image.

    Parameters
    ----------
    args : str
        Thread label used only for the completion log line.
    path : str
        Kept for interface compatibility; files are written under the
        module-level ``storage`` directory, as in the original.

    FIXES vs. original:
    * ``'wb'as f`` was a syntax error — now ``'wb') as f``.
    * ``while not qq.empty(): qq.get()`` raced between workers — one worker
      could block forever on ``get()`` after another took the last item.
      ``get_nowait`` + ``Empty`` exits cleanly instead.
    * ``path = path + filename`` accumulated garbage into a variable that was
      never used — removed.
    * ``getRealLinks`` can return None; skip instead of crashing requests.
    """
    from queue import Empty  # local import: top of file only imports Queue

    while True:
        try:
            # Non-blocking get: an empty queue means all work is done.
            imageLink = qq.get_nowait()
        except Empty:
            break

        imgurl = getRealLinks(imageLink)
        if not imgurl:
            continue  # detail page had no matching CDN image

        r = requests.get(imgurl, stream=True)
        filename = imgurl.split("/")[-1]

        # Stream the body to disk in small chunks to bound memory use.
        with open(storage + '/%s' % filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=128):
                f.write(chunk)

    print("threading:", args, "all read~")
 
if __name__ == "__main__":
  # Orchestration: discover page count, gather all image-page links, then
  # fill a queue with one producer and drain it with a pool of downloaders.
  pagesTotal = getpages(webpage)

  photo = getAllLinks(webpage)
  print("image total", len(photo))  # fixed: missing comma was a syntax error

  qq = Queue()  # shared FIFO work queue; Queue is internally thread-safe

  # Fill the queue completely before any consumer starts.
  Th1 = threading.Thread(target=producer, args=(photo, ))
  Th1.start()
  Th1.join()

  # Pool of 8 downloader threads (labels "2".."9", as in the original).
  threads = []
  for i in range(2, 10):
    t = threading.Thread(target=consumer, args=(str(i), path, ))
    threads.append(t)

  for t in threads:
    t.start()

  for t in threads:
    t.join()

  # Report how many files actually landed in the target folder.
  saved = os.listdir(path)
  print("Photo in file", len(saved))  # fixed: missing comma was a syntax error

  print("success")  # fixed typo: "suscess"


« 上一篇 下一篇 »

发表评论:

◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。