import os
import requests
import lxml
from bs4 import BeautifulSoup
import tqdm
import multiprocessing
import re
from google.colab import drive
drive.mount('/content/drive', force_remount=False)
class Imgbin:
    """Scraper for imgbin.com gallery pages.

    Given a gallery URL (e.g. https://imgbin.com/free-png/re-zero/), it
    discovers every photo-detail page across all pagination pages, resolves
    each page's real CDN image URL, and downloads the images in parallel
    into a subfolder of *path* named after the gallery title.
    """

    def __init__(self, path, url):
        """Store the destination *path* and normalize the gallery *url*.

        The URL is canonicalized by stripping a trailing slash or a trailing
        page number, then re-appending a single slash (i.e. page 1).
        """
        self.path = path
        self.url = re.sub(r"/$|/\d+$", "", url) + "/"
        # Browser-like User-Agent so imgbin serves the normal HTML pages.
        self.headers = {"user-agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}

    # NOTE: method name keeps the original "infromation" typo for
    # backward compatibility with existing callers.
    def get_photo_infromation(self):
        """Collect the gallery title and all photo-detail page links.

        Returns:
            dict with keys:
              'title': gallery title (text of <title> before "PNG Images")
              'links': order-preserving, de-duplicated list of detail URLs
        """
        information = {}
        all_links = []
        response = requests.get(self.url, headers=self.headers)
        soup = BeautifulSoup(response.text, "lxml")
        information['title'] = soup.title.getText().split("PNG Images")[0]
        # Determine the page count from the "Last" pagination link, if present.
        total_pages = 1
        for anchor in soup.find_all('a'):
            if "Last" in anchor.getText():
                try:
                    total_pages = int(anchor.get("href").split("/")[-1])
                except (AttributeError, ValueError):
                    pass  # malformed/missing href: fall back to a single page
                break
        base = re.sub(r"/$|/\d+$", "", self.url)
        for page in range(1, total_pages + 1):
            # Page 1 was already fetched above; fetch subsequent pages here.
            # (Original code fetched page i+1 *after* scraping, which scraped
            # page 1 twice and never scraped the last page.)
            if page > 1:
                response = requests.get(f"{base}/{page}", headers=self.headers)
                soup = BeautifulSoup(response.text, "lxml")
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if href and "/png/" in href:
                    all_links.append("https://imgbin.com" + href)
        # dict.fromkeys de-duplicates while preserving first-seen order.
        information['links'] = list(dict.fromkeys(all_links))
        return information

    def getRealLinks(self, url):
        """Return the CDN ".jpg" image URL found on a detail page.

        Returns None when no matching <img> exists. (The original condition
        `".jpg" and "cdn" in src` only tested "cdn"; both substrings are
        now required, and a missing src attribute no longer raises.)
        """
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, "lxml")
        for img in soup.find_all('img'):
            src = img.get('src')
            if src and ".jpg" in src and "cdn" in src:
                return src
        return None

    def download(self, queue, index, link, folder_path):
        """Worker: stream the image behind *link* into *folder_path*.

        The file is named "<index>---<url basename>". Always puts one
        {index: image_url_or_None} item on *queue* so the consumer loop in
        _multiprocess never blocks waiting for a failed worker.
        """
        imgurl = self.getRealLinks(link)
        if imgurl is None:
            # No downloadable image on this page; still report progress.
            queue.put({index: None})
            return
        # Context manager ensures the streamed connection is released.
        with requests.get(imgurl, headers=self.headers, stream=True) as r:
            filename = index + "---" + imgurl.split("/")[-1]
            with open(os.path.join(folder_path, filename), 'wb') as f:
                for chunk in r.iter_content(chunk_size=128):
                    f.write(chunk)
        queue.put({index: imgurl})

    def _multiprocess(self):
        """Download every gallery image using a pool of 8 worker processes.

        Returns:
            list of {index: image_url} dicts, one per discovered link
            (image_url is None for pages with no downloadable image).
        """
        information = self.get_photo_infromation()
        folder_path = os.path.join(self.path, information['title'])
        # Replaces the os.listdir/except probe: idempotent and race-free.
        os.makedirs(folder_path, exist_ok=True)
        pool = multiprocessing.Pool(8)
        queue = multiprocessing.Manager().Queue()
        for i, link in enumerate(information['links'], start=1):
            # Zero-padded index keeps filenames sorted in discovery order.
            pool.apply_async(self.download,
                             args=(queue, str(i).zfill(3), link, folder_path))
        pool.close()
        returnData = []
        # Exactly one queue item arrives per scheduled link.
        for _ in tqdm.tqdm(information['links']):
            returnData.append(queue.get())
        pool.join()  # reap workers before returning
        return returnData
if __name__ == "__main__":
    # Root folder on the mounted Drive where galleries are saved.
    save_root = "/content/drive/MyDrive/photo_downloads/"
    if not os.path.exists(save_root):
        os.makedirs(save_root)
    # Example galleries:
    #   'https://imgbin.com/free-png/order-a-rabbit/'
    #   'https://imgbin.com/free-png/re-zero/'
    gallery_url = input("Please enter url: \n>>>")
    information = Imgbin(save_root, gallery_url)._multiprocess()
    print("success")