####################### Image to text with Tesseract (Google Colab only)
# Upload an image through the Colab file picker, run Tesseract OCR on it
# and print the recognized text.
import pytesseract
from google.colab import files

# Only the first uploaded file is processed; it is read back from /content/.
uploaded = files.upload()
filename = next(iter(uploaded))
# NOTE(review): despite its name this holds the whole upload result, not a
# path; it is not used anywhere in this section -- confirm before relying on it.
image_path_in_colab = uploaded
# The image path is handed to pytesseract directly. The previous version
# wrapped it in Image.open(...), but `Image` was never imported in this
# section, so the call failed with a NameError.
extractedInformation = pytesseract.image_to_string(f"/content/{filename}", lang="eng")
print(extractedInformation)
# Example for a French text image (requires the 'fra' language data):
# extractedInformation = pytesseract.image_to_string('test-european.jpg', lang='fra')
# print(extractedInformation)
or !pip install easy ocr
import easyocr
import os
from google.colab import drive
from google.colab import files
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
drive.mount('/content/drive', force_remount=False)
path = "/content/drive"
from google.colab import files
uploaded = files.upload()
filename = next(iter(uploaded))
# 创建reader对象
reader = easyocr.Reader(['ch_tra','en']) #ch_sim
# 读取图像
result = reader.readtext('/content/%s'%filename)
# 结果
#print(result)
for i in result:
word = i[1]
print(word)
#######################pdf table format to Excel table for colab only
#先安裝上面兩個modules
!pip install pdfplumber
import pdfplumber
import pandas as pd
from google.colab import drive
from google.colab import files
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
drive.mount('/content/drive', force_remount=True)
global path
path = "/content/drive"
from google.colab import files
uploaded = files.upload() #上傳檔案到default 路徑 /content/
filename = next(iter(uploaded))
df = pdfplumber.open('/content/%s'%filename)
#print(len(df.pages))
all_table=[pd.DataFrame(df.pages[i].extract_table()) for i in range(len(df.pages))]
table_df = pd.concat(all_table) # 将列表转为df
print(table_df)
# 保存excel
table_df.to_excel('/content/drive/MyDrive/uploaded/%s_use.xlsx'%filename.split(".")[0])