# coding=utf-8
import os
import shutil
import time
import math
import random
import re
import requests
import lxml
from bs4 import BeautifulSoup
import multiprocessing
import tqdm
from IPython.display import clear_output
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the downloaded chapters
# are written below this mount point. An existing mount is reused.
drive.mount('/content/drive', force_remount=False)
#=======================================================================================================================================================================================================================================================
class file_organization:
    """Sort a flat directory of chapter files into nested range folders.

    Chapter files are expected to be named ``第N章.txt`` (N = 1..number).
    They are grouped into folders named ``第A至B章`` holding 10 chapters each;
    when there are more than 10 folders at one level, those folders are in
    turn grouped into folders covering 100, 1000, ... chapters.
    """

    def _range_name(self, first, last):
        """Return the folder name covering chapters *first* through *last*."""
        return "第%d至%d章" % (first, last)

    def create_folder(self, number: int, path: str) -> list:
        """Return every range-folder path needed for *number* chapters.

        All levels (10, 100, 1000, ... chapters per folder) are returned as
        direct children of *path*; ``moving_folder_files`` later describes how
        the smaller folders are nested into the larger ones.

        Args:
            number: Total number of chapters.
            path: Directory that contains the chapter files.

        Returns:
            De-duplicated list of folder paths, in creation order.
        """
        folder_list = []
        for exponent in range(1, len(str(number))):
            width = 10 ** exponent
            full_folders = math.ceil(number / width) - 1
            if full_folders == 0:
                # number == width: this level would be one folder holding
                # everything. Nothing is ever moved into it (the level below
                # has at most 10 folders), so it would just stay empty.
                continue
            for j in range(full_folders):
                folder_list.append(
                    os.path.join(path, self._range_name(j * width + 1, (j + 1) * width))
                )
            # Last (possibly partial) folder of this level.
            folder_list.append(
                os.path.join(path, self._range_name(full_folders * width + 1, number))
            )
        # The partial folder of two levels can coincide (e.g. 201-205 for
        # number=205); keep only the first occurrence.
        return list(dict.fromkeys(folder_list))

    def moving_txt_files(self, number: int, path: str) -> list:
        """Return ``[old, new]`` path pairs moving each chapter into its 10-chapter folder.

        Args:
            number: Total number of chapters.
            path: Directory that contains the chapter files.

        Returns:
            One ``[source, destination]`` pair per chapter, in chapter order.
            Empty when *number* is 10 or less (no folders are needed).
        """
        destination = []
        if number <= 10:
            return destination
        for chapter in range(1, number + 1):
            first = (chapter - 1) // 10 * 10 + 1
            last = min(first + 9, number)
            file_name = "第%d章.txt" % chapter
            old = os.path.join(path, file_name)
            new = os.path.join(path, self._range_name(first, last), file_name)
            destination.append([old, new])
        return destination

    def moving_folder_files(self, number: int, path: str) -> list:
        """Return ``[old, new]`` path pairs nesting range folders into the next level.

        A level is only nested when it has more than 10 folders. A folder
        whose range equals its parent's range is skipped, because it is the
        very same directory.

        Args:
            number: Total number of chapters.
            path: Directory that contains the range folders.

        Returns:
            ``[source_folder, destination_folder]`` pairs, smallest level first.
        """
        destination = []
        for exponent in range(1, len(str(number))):
            width = 10 ** exponent
            folder_count = math.ceil(number / width)
            if folder_count <= 10:
                continue
            parent_width = 10 * width
            for j in range(folder_count):
                child_first = j * width + 1
                child_last = min((j + 1) * width, number)
                parent_first = parent_width * (child_first // parent_width) + 1
                parent_last = min(parent_first + parent_width - 1, number)
                if (child_first, child_last) == (parent_first, parent_last):
                    continue
                old = os.path.join(path, self._range_name(child_first, child_last))
                new = os.path.join(path, self._range_name(parent_first, parent_last))
                destination.append([old, new])
        return destination

    def execute_(self, path: str) -> None:
        """Create the range folders under *path* and move chapters into them.

        The chapter count is taken as the number of entries in *path*, so the
        directory is expected to hold only the chapter files. Nothing is done
        for 10 entries or fewer. Each step is best-effort: a failing create or
        move is printed and the remaining items are still processed.
        """
        number = len(os.listdir(path))
        print(number)
        if number <= 10:
            return
        for folder in self.create_folder(number, path):
            try:
                os.makedirs(folder, exist_ok=True)
            except OSError as e:
                print(e)
        print("folder created")
        for old, new in self.moving_txt_files(number, path):
            try:
                shutil.move(old, new)
            except OSError as e:  # shutil.Error is a subclass of OSError
                print(e)
        print("txt files moved into folder")
        for old, new in self.moving_folder_files(number, path):
            try:
                shutil.move(old, new)
            except OSError as e:
                print(e)
        print("All complete")
#=======================================================================================================================================================================================================================================================
class Novel:
    """Download every chapter of one novel from big5.mfxsydw.com as text files.

    Attributes are set in ``__init__``:
        url: Address of the book's index page.
        headers: HTTP headers sent with the index and chapter requests.
    """

    # Site root that relative chapter links are appended to.
    BASE_URL = "https://big5.mfxsydw.com"
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30
    # Paragraph texts that are page navigation, not chapter content.
    NAVIGATION_TEXTS = ("上一頁", "目錄", "下一頁", "免費小說閱讀網")

    def __init__(self, url):
        self.url = url
        self.headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

    def create_path(self, novel_title):
        """Ensure the output directory for *novel_title* exists and return it."""
        path = "/content/drive/MyDrive/小說/%s/" % (novel_title)
        os.makedirs(path, exist_ok=True)
        return path

    def base64(self, _Str):
        """Obfuscate *_Str* for the ``b`` query parameter of the chapter-list request.

        Despite the name this is not Base64: every character found in the
        62-character alphabet is replaced by the one three positions further
        on (wrapping around), other characters are kept, and each result is
        surrounded by two random alphabet characters.

        Returns:
            A string three times as long as *_Str*.
        """
        staticchars = "PXhw7UT1B0a9kQDKZsjIASmOezxYG4CHo5Jyfg2b8FLpEvRr3WtVnlqMidu6cN"
        size = len(staticchars)
        pieces = []
        for char in _Str:
            position = staticchars.find(char)
            code = char if position == -1 else staticchars[(position + 3) % size]
            num1 = int(math.floor(random.random() * size))
            num2 = int(math.floor(random.random() * size))
            pieces.append(staticchars[num1] + code + staticchars[num2])
        return "".join(pieces)

    def getAll_download_information(self):
        """Collect the novel title and the ordered list of chapter URLs.

        Chapter links are gathered from the ``<li>`` elements of the index
        page and from the JSONP response the page's ``callback`` refers to,
        then sorted by the number in ``/<number>.html`` and de-duplicated.

        Returns:
            dict with ``'novel_title'`` (spaces replaced by ``_``) and
            ``'download_list'`` (list of absolute chapter URLs).

        Raises:
            IndexError: If no ``callback=`` value is found in the index page.
        """
        download_information = {}
        response = requests.get(self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        download_information['novel_title'] = "_".join(soup.find("h1").getText().split(" "))

        html_list = []
        for item in soup.find_all("li"):
            anchor = item.find("a")
            # Skip list items that carry no link instead of failing on them.
            if anchor is not None and anchor.get("href"):
                html_list.append(self.BASE_URL + anchor.get("href"))

        book_id = self.url.split("/")[-1].split(".html")[0]
        callback = re.findall(r"callback=(?:\'|\")(.*?)(?:\'|\")", response.text, re.MULTILINE).pop()
        jsonUrl = "https://big5.mfxsydw.com/index.php?c=book&a=show.jsonp&callback={}&book_id={}&b={}".format(callback, book_id, self.base64(callback))
        json_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
                        "Referer": self.url}
        r = requests.get(jsonUrl, headers=json_headers, timeout=self.REQUEST_TIMEOUT)
        # Backslashes are dropped first so the escaped markup in the response
        # can be scanned with a plain regular expression.
        json_list = [
            self.BASE_URL + link
            for link in re.findall(r"<a\s+href=(?:\"|\')(.*?)(?:\'|\")", "".join(r.text.split("\\")))
        ]

        chapter_id = re.compile(r"\/(\d+)\.html")
        # Links without a chapter number cannot be ordered and are ignored.
        chapter_links = [link for link in html_list + json_list if chapter_id.search(link)]
        download_list = sorted(chapter_links, key=lambda link: int(chapter_id.search(link).group(1)))
        download_information['download_list'] = list(dict.fromkeys(download_list))
        return download_information

    def run_command(self):
        """Download all chapters with 10 worker processes and return the output directory.

        Chapters are numbered from 1 in the order of the download list. A
        progress bar advances once per finished chapter.
        """
        download_information = self.getAll_download_information()
        path = self.create_path(download_information['novel_title'])
        download_list = download_information['download_list']
        pool = multiprocessing.Pool(10)
        manager = multiprocessing.Manager()
        try:
            queue = manager.Queue()
            for number, url in enumerate(download_list, start=1):
                pool.apply_async(self.execute_download, args=(queue, url, number, path))
            pool.close()
            self.progressBar(queue, download_list)
            pool.join()
        finally:
            # Release the worker and manager processes even if interrupted.
            pool.terminate()
            manager.shutdown()
        return path

    def execute_download(self, queue, href, number, path):
        """Fetch one chapter page and save it as ``第<number>章.txt`` in *path*.

        The file holds the page's ``<h1>`` text followed by its ``<p>`` texts
        (navigation entries removed). *href* is always put on *queue* when the
        attempt ends, successful or not, so the progress bar in the parent
        process cannot wait forever; a failure is printed instead of raised.
        """
        try:
            r = requests.get(href, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(r.text, "lxml")
            novel_header = soup.find("h1").getText()
            paragraphs = [p.getText() for p in soup.find_all("p")]
            novel_content = "\n\n".join(
                text for text in paragraphs if text not in self.NAVIGATION_TEXTS
            )
            if len(novel_content) < 40:
                print(f"\n第{number}章: {novel_header} \nLess than 40 words")
            # Explicit encoding: the text is Chinese and must not depend on
            # the platform's default codec.
            with open(os.path.join(path, f"第{number}章.txt"), "w", encoding="utf-8") as file:
                file.write(novel_header + "\n\n" + novel_content)
        except Exception as error:
            # Worker boundary: report the failure; apply_async would otherwise
            # discard it silently.
            print(f"\n第{number}章: download failed ({href}): {error}")
        finally:
            queue.put(href)

    def progressBar(self, queue, download_list):
        """Show a progress bar that advances once per item received from *queue*."""
        for _ in tqdm.tqdm(range(len(download_list))):
            queue.get()
#====================================================================================================================================================================================================================================
if __name__ == "__main__":
    # Index page of the book to download.
    url = "https://big5.mfxsydw.com/book/5939.html"
    # Step 1: download every chapter; the directory they were saved in is returned.
    novel = Novel(url)
    novel_path = novel.run_command()
    # Step 2: sort the downloaded chapter files into range folders.
    organizer = file_organization()
    organizer.execute_(novel_path)