# 2022-03-27
# Rewrite for downloading imgbin photos (27/3/2022)

import os
import requests
import lxml
from bs4 import BeautifulSoup
import tqdm
import multiprocessing
import time
import re

# Mount Google Drive so downloaded images persist outside the Colab VM.
# force_remount=False reuses an existing mount if one is already active.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

class Imgbin:
  """Download every PNG listed on an imgbin.com listing page.

  Walks all result pages of the listing URL, collects the per-image
  detail-page links, resolves each detail page to its CDN image URL,
  and downloads the files in parallel into a folder under ``path``.
  """

  def __init__(self, path, url):
    # path: destination directory root (e.g. a mounted Drive folder).
    # url: imgbin.com listing page, e.g. https://imgbin.com/free-png/<tag>/
    self.path = path
    self.url = url
    # Browser-like UA so imgbin serves the normal HTML pages.
    self.headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}

  def get_photo_information(self):
    """Scrape the listing and return {'title': str, 'links': [detail URLs]}."""
    information = {}
    all_links = []

    response = requests.get(self.url, headers=self.headers)
    soup = BeautifulSoup(response.text, "lxml")

    # Page title looks like "<name> PNG Images ..." — keep the name part.
    information['title'] = soup.title.getText().split("PNG Images")[0]

    # The "Last" pagination link's href ends with the total page count;
    # fall back to a single page when pagination is absent or malformed.
    total_pages = 1
    for anchor in soup.find_all('a'):
      if "Last" in anchor.getText():
        try:
          total_pages = int(anchor.get("href").split("/")[-1])
        except (AttributeError, ValueError):
          pass
        break

    for page in range(total_pages):
      for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Guard against anchors without an href attribute.
        if href and "/png/" in href:
          all_links.append("https://imgbin.com" + href)

      # Strip a trailing "/" or "/<n>" from the URL, then append the
      # next page number to fetch the following results page.
      next_url = re.sub(r"/$|/\d+$", "", self.url) + f"/{page + 1}"
      response = requests.get(next_url, headers=self.headers)
      soup = BeautifulSoup(response.text, "lxml")

    # De-duplicate while preserving first-seen order.
    information['links'] = list(dict.fromkeys(all_links))
    return information

  # Backward-compatible alias for the original (misspelled) method name.
  get_photo_infromation = get_photo_information

  def getRealLinks(self, url):
    """Resolve an imgbin detail page to the CDN URL of the full image.

    Returns None when no matching <img> is found.
    """
    response = requests.get(url, headers=self.headers)
    soup = BeautifulSoup(response.text, "lxml")

    for img in soup.find_all('img'):
      src = img.get('src')
      # NOTE: the original condition `".jpg" and "cdn" in src` only
      # checked for "cdn" (the literal ".jpg" is always truthy);
      # test both substrings, and skip tags with no src at all.
      if src and ".jpg" in src and "cdn" in src:
        return src
    return None

  def download(self, queue, index, link, folder_path):
    """Fetch one image, stream it to folder_path, then signal via queue.

    index is a zero-padded ordinal used to prefix the saved filename.
    """
    imgurl = self.getRealLinks(link)

    # stream=True avoids holding the whole image in memory.
    r = requests.get(imgurl, headers=self.headers, stream=True)

    filename = index + "---" + imgurl.split("/")[-1]

    with open(os.path.join(folder_path, filename), 'wb') as f:
      for chunk in r.iter_content(chunk_size=128):
        f.write(chunk)

    # Tell the parent process this image is done (drives the progress bar).
    queue.put("done")

  def _process(self):
    """Scrape the listing, then download every image with a process pool."""
    information = self.get_photo_information()

    folder_path = os.path.join(self.path, information['title'])
    # Create the target folder if needed (replaces the listdir/except probe).
    os.makedirs(folder_path, exist_ok=True)

    total = len(information['links'])
    print("image total", total)

    pool = multiprocessing.Pool(10)
    # Manager queue is picklable across worker processes.
    queue = multiprocessing.Manager().Queue()

    for i, link in enumerate(information['links'], start=1):
      index = str(i).zfill(3)
      pool.apply_async(self.download, args=(queue, index, link, folder_path))

    pool.close()
    # Progress bar advances as each worker reports completion.
    for _ in tqdm.tqdm(range(total)):
      queue.get()
    pool.join()

    print("success")

if __name__ == "__main__":
  # Default destination inside the mounted Google Drive.
  path = "/content/drive/MyDrive/"
  # Example listing pages:
  #   https://imgbin.com/free-png/order-a-rabbit/
  #   https://imgbin.com/free-png/re-zero/
  url = input("Please enter url: \n>>>")

  Imgbin(path, url)._process()


# --- Blog-page residue (not code), kept as a comment so the file parses ---
# « Previous | Next »
# Post a comment: you are welcome to join the discussion and share your views.