'''PDF to image'''
import os
import time
import re
from IPython.display import clear_output
from PIL import Image
import tqdm
import multiprocessing
try:
from pdf2image import pdfinfo_from_path, convert_from_path
except:
!pip install pdf2image
!sudo apt-get install poppler-utils
from pdf2image import pdfinfo_from_path, convert_from_path
clear_output()
from google.colab import drive
drive.mount('/content/drive', force_remount=False)
Image.MAX_IMAGE_PIXELS = None
def _processing(path, save_path, page, queue):
    """Render a single PDF page to a JPEG file and report the outcome.

    The output file is named ``<page>---<pdf file name without extension>.jpg``
    and written into ``save_path``.

    Args:
        path: Path of the source PDF.
        save_path: Directory the JPEG is written to.
        page: Page number, passed as both ``first_page`` and ``last_page``
            so that exactly one page is rendered.
        queue: Object with a ``put`` method. Exactly one message is put per
            call: ``"done"`` on success, or a ``"failed: ..."`` description
            when rendering or saving raised.

    Raises:
        Exception: Whatever the conversion or save raised; it is re-raised
            after the failure message has been queued.
    """
    # basename/splitext strip only the directory and the final extension, so a
    # file name that itself contains dots is no longer cut at its first dot.
    stem = os.path.splitext(os.path.basename(path))[0]
    target = os.path.join(save_path, f"{page}---{stem}.jpg")
    try:
        images = convert_from_path(path, dpi=72, first_page=page, last_page=page)
        images[0].save(target, 'JPEG')
    except Exception as exc:
        # A consumer counting one message per page must never be left waiting
        # for a page that failed.
        queue.put(f"failed: page {page}: {exc}")
        raise
    queue.put("done")
# Source PDF (inside Google Drive) and the directory the page images go to.
your_pdf_name = "100---五等分的花嫁漫畫_第100話,試看版_在線漫畫閱讀_漫畫人1_6325.pdf"
pdf_path = f"/content/drive/MyDrive/{your_pdf_name}"
save_path = "/content/drive/MyDrive/"
info = pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None)
maxPages = info["Pages"]
pool = multiprocessing.Pool(10)
# The queue is still handed to every worker because _processing requires it.
queue = multiprocessing.Manager().Queue()
# Keep the AsyncResult handles: waiting on them (rather than counting queue
# messages) re-raises a worker's exception here instead of blocking forever
# on a message that a failed worker never sent.
results = [
    pool.apply_async(_processing, args=(pdf_path, save_path, page, queue))
    for page in range(1, maxPages + 1)
]
pool.close()
try:
    for result in tqdm.tqdm(results):
        result.get()
finally:
    # Always wait for the worker processes to exit, even after a failure.
    pool.join()
print("finished")
=====================================================================================================================
'''image to PDF'''
try:
from fpdf import FPDF
import PyPDF2
except:
!pip install pillow fpdf
!pip install PyPDF2
from fpdf import FPDF
import PyPDF2
import os
import threading
from PIL import Image
import time
from google.colab import drive
from google.colab import files
from google.colab import auth
drive.mount('/content/drive', force_remount=True)
# Folder the source images are read from.
image_folder = '/content/drive/My Drive/images/'
# Folder the generated PDFs are written to.
output_folder = '/content/drive/My Drive/pdf/'
if not os.path.exists(image_folder):
    os.makedirs(image_folder)
    print("images已创建")
# A dedicated name is used for the listing so that it does not shadow the
# ``files`` module imported from google.colab.
existing_images = os.listdir(image_folder)
if existing_images:
    print("images档案内有图片档案")
else:
    print("images档案内没有图片档案,请upload图片到Gdrv/images内!")
    # Stop here: there is nothing to convert. SystemExit is raised directly
    # because exit() is a helper meant for the interactive shell.
    raise SystemExit
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print("pdf已创建")
# Convert one image into a single-page PDF.
def convert_to_pdf(image_path, output_path):
    """Write ``image_path`` as a one-page PDF at ``output_path``.

    The page is created with ``unit="pt"`` and a format equal to the image's
    own pixel width and height, and the image is placed at the page origin.

    Args:
        image_path: Path of the image to convert.
        output_path: Path of the PDF file to create.
    """
    # Size the page from the image being converted. Previously the size came
    # from the first entry of the global ``image_files`` list, so every page
    # got that one image's dimensions, and the opened image was never used or
    # closed.
    with Image.open(image_path) as image:
        width, height = image.size
    pdf = FPDF(unit="pt", format=[width, height])
    pdf.add_page()
    pdf.image(image_path, 0, 0)
    pdf.output(output_path, "F")
# Convert the images to PDFs concurrently.
def process_images(image_files):
    """Convert every listed image into a PDF in ``output_folder``.

    Each name in ``image_files`` is resolved against the module-level
    ``image_folder``; the PDF keeps the image's base name with a ``.pdf``
    extension and is written to the module-level ``output_folder``.

    Args:
        image_files: File names (not full paths) of the images to convert.

    Raises:
        Exception: The first error raised by ``convert_to_pdf``, after all
            submitted conversions have finished.
    """
    # Imported locally so this function does not depend on a file-level import.
    from concurrent.futures import ThreadPoolExecutor

    if not image_files:
        return
    # A pooled executor caps the number of live threads (the original started
    # one thread per image) and lets worker exceptions reach the caller
    # instead of being lost inside the thread.
    with ThreadPoolExecutor() as executor:
        futures = []
        for image_file in image_files:
            image_path = os.path.join(image_folder, image_file)
            output_path = os.path.join(output_folder, os.path.splitext(image_file)[0] + '.pdf')
            futures.append(executor.submit(convert_to_pdf, image_path, output_path))
        # Wait for every conversion; result() re-raises a worker's exception.
        for future in futures:
            future.result()
# Merge several PDF files into one PDF file.
def merge_pdfs(pdf_files, output_path):
    """Concatenate ``pdf_files`` in the given order into ``output_path``.

    Args:
        pdf_files: PDF paths to merge. Each one is joined onto the
            module-level ``output_folder``; ``os.path.join`` returns an
            absolute path unchanged, so both bare file names and absolute
            paths are accepted.
        output_path: Path of the merged PDF to write.
    """
    merger = PyPDF2.PdfMerger()
    try:
        for filename in pdf_files:
            merger.append(os.path.join(output_folder, filename))
        with open(output_path, 'wb') as output_file:
            merger.write(output_file)
    finally:
        # Release the handles the merger holds on the appended input files,
        # even when appending or writing fails.
        merger.close()
import re


def _natural_key(path):
    """Return a case-insensitive sort key that compares digit runs by value.

    With this key "2---a.pdf" sorts before "10---a.pdf", which a plain
    lexicographic sort gets wrong.
    """
    # re.split with a capturing group alternates text and digit runs, so the
    # odd positions are always digit runs. Keys therefore line up type-for-type
    # (str, int, str, ...) and never compare an int against a str.
    return [
        int(token) if index % 2 else token.lower()
        for index, token in enumerate(re.split(r"(\d+)", path))
    ]


# Collect the image file names.
image_files = os.listdir(image_folder)
# Convert every image to its own PDF.
process_images(image_files)
# Paths of the PDFs produced by the conversion step.
pdf_files = [os.path.join(output_folder, os.path.splitext(image_file)[0] + '.pdf') for image_file in image_files]
# Order pages numerically so page 10 does not land between pages 1 and 2.
sorted_pdf_files = sorted(pdf_files, key=_natural_key)
# Merge the per-image PDFs into one file.
merge_pdfs(sorted_pdf_files, '/content/drive/My Drive/pdf/merged.pdf')
#def getSize():
#img = [os.path.join(image_folder, image_file) for image_file in image_files]
#for i in range(len(img)):
#cover = Image.open(img[1])
#width, height = cover.size
#print(width, height)
==============================================================================================================
'''merge PDF'''
try:
from fpdf import FPDF
import PyPDF2
except:
!pip install pillow fpdf
!pip install PyPDF2
from fpdf import FPDF
import PyPDF2
import os
import threading
from PIL import Image
import time
from google.colab import drive
from google.colab import files
from google.colab import auth
drive.mount('/content/drive', force_remount=True)
def compress_pdf(input_path, output_path):
    """Copy a PDF page by page, compressing each page's content streams.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path of the compressed PDF to write.
    """
    pdf = PyPDF2.PdfReader(input_path)
    writer = PyPDF2.PdfWriter()
    for page in pdf.pages:
        # The method has to be *called*; the original only referenced the
        # attribute, so the output was never actually compressed.
        page.compress_content_streams()
        writer.add_page(page)
    with open(output_path, "wb") as f:
        writer.write(f)
# Compress the merged PDF into a second file placed next to it.
input_file, output_file = (
    '/content/drive/My Drive/pdf/merged.pdf',
    '/content/drive/My Drive/pdf/cp_merged.pdf',
)
compress_pdf(input_file, output_file)