import os
import requests
import lxml
from bs4 import BeautifulSoup
import tqdm
import multiprocessing
import re
from google.colab import drive
drive.mount('/content/drive', force_remount=False)
class Imgbin:
    """Scraper for imgbin.com gallery pages.

    Given a gallery URL (e.g. https://imgbin.com/free-png/re-zero/), it
    discovers every photo-detail page across all pagination pages, resolves
    each page's real CDN image URL, and downloads the images in parallel
    into a subfolder of *path* named after the gallery title.
    """

    def __init__(self, path, url):
        """Store the destination *path* and normalize the gallery *url*.

        The URL is canonicalized by stripping a trailing slash or a trailing
        page number, then re-appending a single slash (i.e. page 1).
        """
        self.path = path
        self.url = re.sub(r"/$|/\d+$", "", url) + "/"
        # Browser-like User-Agent so imgbin serves the normal HTML pages.
        self.headers = {"user-agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}

    # NOTE: method name keeps the original "infromation" typo for
    # backward compatibility with existing callers.
    def get_photo_infromation(self):
        """Collect the gallery title and all photo-detail page links.

        Returns:
            dict with keys:
              'title': gallery title (text of <title> before "PNG Images")
              'links': order-preserving, de-duplicated list of detail URLs
        """
        information = {}
        all_links = []
        response = requests.get(self.url, headers=self.headers)
        soup = BeautifulSoup(response.text, "lxml")
        information['title'] = soup.title.getText().split("PNG Images")[0]
        # Determine the page count from the "Last" pagination link, if present.
        total_pages = 1
        for anchor in soup.find_all('a'):
            if "Last" in anchor.getText():
                try:
                    total_pages = int(anchor.get("href").split("/")[-1])
                except (AttributeError, ValueError):
                    pass  # malformed/missing href: fall back to a single page
                break
        base = re.sub(r"/$|/\d+$", "", self.url)
        for page in range(1, total_pages + 1):
            # Page 1 was already fetched above; fetch subsequent pages here.
            # (Original code fetched page i+1 *after* scraping, which scraped
            # page 1 twice and never scraped the last page.)
            if page > 1:
                response = requests.get(f"{base}/{page}", headers=self.headers)
                soup = BeautifulSoup(response.text, "lxml")
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if href and "/png/" in href:
                    all_links.append("https://imgbin.com" + href)
        # dict.fromkeys de-duplicates while preserving first-seen order.
        information['links'] = list(dict.fromkeys(all_links))
        return information

    def getRealLinks(self, url):
        """Return the CDN ".jpg" image URL found on a detail page.

        Returns None when no matching <img> exists. (The original condition
        `".jpg" and "cdn" in src` only tested "cdn"; both substrings are
        now required, and a missing src attribute no longer raises.)
        """
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, "lxml")
        for img in soup.find_all('img'):
            src = img.get('src')
            if src and ".jpg" in src and "cdn" in src:
                return src
        return None

    def download(self, queue, index, link, folder_path):
        """Worker: stream the image behind *link* into *folder_path*.

        The file is named "<index>---<url basename>". Always puts one
        {index: image_url_or_None} item on *queue* so the consumer loop in
        _multiprocess never blocks waiting for a failed worker.
        """
        imgurl = self.getRealLinks(link)
        if imgurl is None:
            # No downloadable image on this page; still report progress.
            queue.put({index: None})
            return
        # Context manager ensures the streamed connection is released.
        with requests.get(imgurl, headers=self.headers, stream=True) as r:
            filename = index + "---" + imgurl.split("/")[-1]
            with open(os.path.join(folder_path, filename), 'wb') as f:
                for chunk in r.iter_content(chunk_size=128):
                    f.write(chunk)
        queue.put({index: imgurl})

    def _multiprocess(self):
        """Download every gallery image using a pool of 8 worker processes.

        Returns:
            list of {index: image_url} dicts, one per discovered link
            (image_url is None for pages with no downloadable image).
        """
        information = self.get_photo_infromation()
        folder_path = os.path.join(self.path, information['title'])
        # Replaces the os.listdir/except probe: idempotent and race-free.
        os.makedirs(folder_path, exist_ok=True)
        pool = multiprocessing.Pool(8)
        queue = multiprocessing.Manager().Queue()
        for i, link in enumerate(information['links'], start=1):
            # Zero-padded index keeps filenames sorted in discovery order.
            pool.apply_async(self.download,
                             args=(queue, str(i).zfill(3), link, folder_path))
        pool.close()
        returnData = []
        # Exactly one queue item arrives per scheduled link.
        for _ in tqdm.tqdm(information['links']):
            returnData.append(queue.get())
        pool.join()  # reap workers before returning
        return returnData
if __name__ == "__main__":
    # Root folder on the mounted Drive where galleries are saved.
    save_root = "/content/drive/MyDrive/photo_downloads/"
    if not os.path.exists(save_root):
        os.makedirs(save_root)
    # Example galleries:
    #   'https://imgbin.com/free-png/order-a-rabbit/'
    #   'https://imgbin.com/free-png/re-zero/'
    gallery_url = input("Please enter url: \n>>>")
    information = Imgbin(save_root, gallery_url)._multiprocess()
    print("success")