13
2022
02

multiprocessing for 免費小說閱讀網

# coding=utf-8
import os
import shutil
import time
import math
import random
import re

import requests
import lxml
from bs4 import BeautifulSoup

import multiprocessing
import tqdm
from IPython.display import clear_output

from google.colab import drive
drive.mount('/content/drive', force_remount=False)
#=======================================================================================================================================================================================================================================================
class file_organization:

  def create_folder(selfnumberpath):
    folder_list = []
    for i in range(1,len(str(number))):
      divide_part = math.pow(10,i)
      
      folder_number = math.ceil(number/divide_part) -1  
      
      for j in range(folder_number):
        folder_path = os.path.join(path, "第%d至%d章"%(j*divide_part+1,(j+1)*divide_part))
        folder_list.append(folder_path)

      folder_path = os.path.join(path,"第%d至%d章"%(folder_number*divide_part+1,number))
      folder_list.append(folder_path)

    return list(dict.fromkeys(folder_list))

  def moving_txt_files(selfnumberpath):
    destination = []
    if number <= 10:
      return
    
    folder_number = math.ceil(number/10

    for i in range(folder_number):
      for j in range(10):
        if i*10+j+1 > number:
          break

        old = os.path.join(path,"第%d章.txt"%(i*10+j+1))

        if (i+1)*10 > number:
          new = os.path.join(path,"第%d至%d章"%(i*10+1,number), "第%d章.txt"%(i*10+j+1))
        else:
          new = os.path.join(path,"第%d至%d章"%(i*10+1,(i+1)*10), "第%d章.txt"%(i*10+j+1))

        destination.append([old, new])
        
    return destination

  def moving_folder_files(selfnumberpath):
    destination = []
    
    for i in range(1,len(str(number))):
      divided_part = math.pow(10,i)
      
      folder_number = math.ceil(number/divided_part)
      if folder_number <= 10:
        continue

      for j in range(folder_number):
        x1 = j*divided_part+1
        x2 = (j+1)*divided_part
        x3 = 10*divided_part*math.floor((j*divided_part+1)/(10*divided_part))+1
        x4 = x3 + 10*divided_part - 1

        if x2 > number:
          x2 = number
        if x4 > number:
          x4 = number
        if x1 == x3 and x2 == x4:
          continue
        
        old = os.path.join(path,"第%d至%d章"%(x1,x2))
        new = new = os.path.join(path,"第%d至%d章"%(x3,x4))
        
        destination.append([old, new])
    
    return destination

  def execute_(selfpath):
    number = len(os.listdir(path))
    print(number)
    
    if number <=10:
      return

    for folder in self.create_folder(number,path):
      try:
        os.makedirs(folder)
      except Exception as e:
        print(e)
    
    print("folder created")

    for txt_path in self.moving_txt_files(number,path):
      try:
        shutil.move(txt_path[0], txt_path[1])
      except Exception as e:
        print(e)
    
    print("txt files moved into folder")

    for folder_path in self.moving_folder_files(number,path):
      try:
        shutil.move(folder_path[0], folder_path[1])
      except Exception as e:
        print(e)

    print("All complete")
#=======================================================================================================================================================================================================================================================
class Novel:

  def __init__(selfurl):
    self.url = url
    self.headers = {"User-Agent"'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

  def create_path(selfnovel_title):
    path = "/content/drive/MyDrive/小說/%s/"%(novel_title)
    
    try:
      os.listdir(path)
    except:
      os.makedirs(path)
    
    return path

  def base64(self, _Str):
    staticchars = "PXhw7UT1B0a9kQDKZsjIASmOezxYG4CHo5Jyfg2b8FLpEvRr3WtVnlqMidu6cN"
    encodechars = ""

    for i in range(len(_Str)): 
      try:
        num0 = staticchars.index(_Str[i])
      except:
        num0 = -1

      if num0 == -1:
        code = _Str[i]  
      else:
        code = staticchars[(num0+3)%62]
      
      num1 = int(math.floor(random.random()*62))
      num2 = int(math.floor(random.random()*62))
      encodechars += (staticchars[num1] + code + staticchars[num2])
 
    return encodechars

  def getAll_download_information(self):
    download_information = {}
    response = requests.get(self.url, headers = self.headers)

    soup = BeautifulSoup(response.text, "lxml")
    download_information['novel_title'] = "_".join(soup.find("h1").getText().split(" ")) 

    html_list = list(map(lambda x: "https://big5.mfxsydw.com" + x.find("a").get("href"), soup.find_all("li")))
    id = self.url.split("/")[-1].split(".html")[0]
    callback = re.findall("callback=(?:\'|\")(.*?)(?:\'|\")", response.text, re.MULTILINE).pop() 

    jsonUrl = "https://big5.mfxsydw.com/index.php?c=book&a=show.jsonp&callback={}&book_id={}&b={}".format(callback, id, self.base64(callback))

    json_headers = {"User-Agent""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
             "Referer"self.url}

    r = requests.get(jsonUrl, headers = json_headers)
    json_list = list(map(lambda x: "https://big5.mfxsydw.com" + x, re.findall("<a\s+href=(?:\"|\')(.*?)(?:\'|\")""".join(r.text.split("\\"))))) 
    
    download_list = sorted((html_list + json_list), key=lambda x: int(re.search("\/(\d+)\.html", x).group(1)))
    download_information['download_list'] = list(dict.fromkeys(download_list))

    return download_information

  def run_command(self):
    dowload_information = self.getAll_download_information()
    path = self.create_path(dowload_information['novel_title'])

    pool = multiprocessing.Pool(10)
    queue = multiprocessing.Manager().Queue()

    number = 1
    for url in dowload_information['download_list']:
      pool.apply_async(self.execute_download, args = (queue, url, number, path))
      number += 1
    
    pool.close()
    self.progressBar(queue, dowload_information['download_list'])

    return path

  def execute_download(selfqueuehrefnumberpath):
    r = requests.get(href, headers = self.headers)
    soup = BeautifulSoup(r.text, "lxml")

    novel_header = soup.find("h1").getText()
    content = list(map(lambda x: x.getText(), soup.find_all("p")))
    novel_content = "\n\n".join(list(filter(lambda x: x!="上一頁" and x!= "目錄" and x!= "下一頁" and x!="免費小說閱讀網", content)))

    if len(novel_content) < 40:
      print(f"\n第{number}章: {novel_header} \nLess than 40 words")

    with open(os.path.join(path, f"第{number}章.txt"), "w"as file:
      file.write(novel_header + "\n\n" + novel_content)
      file.flush()

    queue.put(url)

  def progressBar(selfqueuedownload_list):    
    for i in tqdm.tqdm(range(len(download_list))):
      file = queue.get()
#====================================================================================================================================================================================================================================
if __name__ == "__main__":
  url = "https://big5.mfxsydw.com/book/5939.html"

  novel_path = Novel(url).run_command()

  file_organization().execute_(novel_path)


« 上一篇 下一篇 »

发表评论:

◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。