16
2021
09

Image to text (OCR) + PDF table to Excel table


#######################image to text for colab only


!sudo apt install tesseract-ocr
!pip install pytesseract



# OCR an uploaded image with Tesseract and print the recognised text.
# Google Colab only: relies on google.colab for auth, Drive and the upload
# widget, and on the tesseract-ocr / pytesseract installs done beforehand.
import pytesseract

import shutil
import os
import random
from google.colab import drive
from google.colab import files
from google.colab import auth

# Authenticate the Colab user before Google Drive is mounted.
auth.authenticate_user()
from oauth2client.client import GoogleCredentials

drive.mount('/content/drive', force_remount=True)
# Mount point of Google Drive inside the Colab VM.
path = "/content/drive"

try:
    from PIL import Image
except ImportError:
    # Fallback for installs that expose a top-level ``Image`` module.
    import Image

# Ask the user for a file; the first uploaded name is the image to read.
uploaded = files.upload()
filename = next(iter(uploaded))

# Kept for backward compatibility with later cells. NOTE: despite its name
# this is the object returned by files.upload(), not a path string.
image_path_in_colab = uploaded

# Open the image in a context manager so the file handle is released as soon
# as OCR has finished (the original left the image open).
with Image.open(f"/content/{filename}") as image:
    extractedInformation = pytesseract.image_to_string(image, lang="eng")

print(extractedInformation)


# French text image to string

#extractedInformation = pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra')

#print(extractedInformation)







Or use EasyOCR instead: !pip install easyocr



# EasyOCR alternative: recognise the text in an uploaded image and print it,
# one detected string per line. Google Colab only.
import easyocr
import os
from google.colab import drive
from google.colab import files
from google.colab import auth

auth.authenticate_user()
from oauth2client.client import GoogleCredentials

# force_remount=False: an existing Drive mount is reused rather than redone.
drive.mount('/content/drive', force_remount=False)
path = "/content/drive"

# Ask the user for a file; the first uploaded name is the image to read.
uploaded = files.upload()
filename = next(iter(uploaded))

# Create the reader object for Traditional Chinese + English
# (the original author noted 'ch_sim' as the alternative language code).
reader = easyocr.Reader(['ch_tra', 'en'])

# Read the image.
result = reader.readtext(f'/content/{filename}')

# Results: element 1 of every entry is the recognised text.
# print(result)
for detection in result:
    print(detection[1])







#######################pdf table format to Excel table for colab only


!pip install tabula-py
!pip install pandas

#先安裝上面兩個modules


# Convert the tables of an uploaded PDF into a single Excel sheet on Google
# Drive using tabula. Google Colab only; needs tabula-py and pandas.
import os

import tabula
import pandas as pd
from google.colab import drive
from google.colab import files
from google.colab import auth

auth.authenticate_user()
from oauth2client.client import GoogleCredentials

drive.mount('/content/drive', force_remount=True)
# Mount point of Google Drive inside the Colab VM.
path = "/content/drive"

# Upload the file to the default path, /content/.
uploaded = files.upload()
filename = next(iter(uploaded))

# tabula reads every page of the PDF. The result is a sequence of tables:
# df[0] is the first table, df[1] the second, and so on.
# NOTE(review): the original comment equated table N with PDF page N; that
# presumably holds only when each page has exactly one table -- confirm.
df = tabula.read_pdf('/content/%s' % filename, pages='all')
if not df:
    # pd.concat([]) fails with an opaque message; say what actually happened.
    raise ValueError("No tables were detected in %r" % filename)

result = pd.concat(df)

# Make sure the target folder exists, otherwise to_excel cannot write there.
output_dir = '/content/drive/MyDrive/uploaded'
os.makedirs(output_dir, exist_ok=True)

# splitext strips only the final extension, so "report.v2.pdf" becomes
# "report.v2.xlsx" instead of being cut at the first dot.
stem = os.path.splitext(filename)[0]
result.to_excel('%s/%s.xlsx' % (output_dir, stem))


print(result)






!pip install pdfplumber



# Extract a table from every page of an uploaded PDF with pdfplumber, stack
# them into one DataFrame and save it as an Excel file on Google Drive.
# Google Colab only; needs pdfplumber and pandas.
# (The original had the whole script collapsed onto a single line, which is
# not valid Python; it is restored here to one statement per line.)
import os

import pdfplumber
import pandas as pd
from google.colab import drive
from google.colab import files
from google.colab import auth

auth.authenticate_user()
from oauth2client.client import GoogleCredentials

drive.mount('/content/drive', force_remount=True)
# Mount point of Google Drive inside the Colab VM.
path = "/content/drive"

# Upload the file to the default path, /content/.
uploaded = files.upload()
filename = next(iter(uploaded))

# Open the PDF in a context manager so it is closed once the tables have
# been extracted.
with pdfplumber.open('/content/%s' % filename) as pdf:
    # print(len(pdf.pages))
    # One DataFrame per page, built from that page's extract_table() result.
    all_table = [pd.DataFrame(page.extract_table()) for page in pdf.pages]

# Turn the list of per-page frames into a single DataFrame.
table_df = pd.concat(all_table)
print(table_df)

# Save as Excel. Create the target folder first so the write cannot fail on
# a missing directory, and strip only the final extension from the name.
output_dir = '/content/drive/MyDrive/uploaded'
os.makedirs(output_dir, exist_ok=True)
stem = os.path.splitext(filename)[0]
table_df.to_excel('%s/%s_use.xlsx' % (output_dir, stem))

« 上一篇 下一篇 »

发表评论:

◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。