import tabula
import pandas as pd
from google.colab import drive
from google.colab import files
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
import re
from google.colab import files
# Mount point for Google Drive. A name assigned at module level is already
# global, so no `global` statement is needed.
path = "/content/drive"
# Mount Drive; the output workbook is later written under its MyDrive folder.
drive.mount(path, force_remount=True)

# Ask the user to upload a file through the Colab upload widget.
uploaded = files.upload()
if not uploaded:
    # Fail with a readable message instead of a bare StopIteration below.
    raise ValueError("No file was uploaded.")
# Only the first uploaded entry is processed.
filename = next(iter(uploaded))

# Extract the tables from every page of the uploaded PDF, which is read back
# from /content/<filename>.
df = tabula.read_pdf(f"/content/{filename}", pages='all')
# Stack the per-page tables into one frame (result = pandas DataFrame).
result = pd.concat(df)
# NOTE: an earlier variant took the descriptions from column F of
# "/content/drive/MyDrive/uploaded/<name>.xlsx" (pd.read_excel with
# usecols=[5], then .values.tolist()) instead of from the PDF table.
desc = result["Component"].values.tolist()

# Fixed multi-line text that is stripped from every description.
_BOILERPLATE = "CPSIA lead in surface coating\rCCPSA heavy metal in surface\rcoating"

# Blank out missing cells and drop the boilerplate text. Missing values are
# detected with pd.isna rather than by deleting the substring "nan", which
# would also corrupt real words that happen to contain those letters.
data = ["" if pd.isna(i) else str(i).replace(_BOILERPLATE, "") for i in desc]

# A description is tagged only when the keyword is preceded by whitespace.
_COATING_RE = re.compile(r'.*\s+coating')
_PLASTIC_RE = re.compile(r'.*\s+(?:[Pp]lastic|PVC)')

# cc keeps the descriptions that mention a coating, "" elsewhere.
cc = [f if _COATING_RE.match(f) else "" for f in data]
print(cc)
# dd keeps the descriptions that mention plastic / Plastic / PVC, "" elsewhere.
dd = [d if _PLASTIC_RE.match(d) else "" for d in data]
print(dd)
# e concatenates both per row; a row matching both patterns holds its
# description twice.
e = [coating + plastic for coating, plastic in zip(cc, dd)]
print(e)
def get_indexes(list, element):
    """Return the positions in *list* whose item equals or contains *element*.

    Args:
        list: Items to scan. The parameter name shadows the builtin; it is
            kept so that existing callers keep working.
        element: Value to look for. It is compared with ``==`` and with
            ``in`` against each item, so for strings a substring counts.

    Returns:
        The indexes, in ascending order, of every matching item. Empty when
        nothing matches or *list* is empty.
    """
    return [
        i
        for i, item in enumerate(list)
        if item == element or element in item
    ]
def replace(index_list, list_tbm, string):
    """Overwrite the entries of *list_tbm* at the given positions with *string*.

    Args:
        index_list: Indexes into *list_tbm* to overwrite.
        list_tbm: List to be modified; it is mutated in place.
        string: Value stored at each listed index.

    Returns:
        The same *list_tbm* object, after modification.

    Raises:
        IndexError: If an index is out of range for *list_tbm*.
    """
    for index in index_list:
        list_tbm[index] = string
    return list_tbm
# Collapse every tagged description in `e` to a canonical material label.
# The passes run in this order and each one overwrites the whole entry, so an
# entry containing both "plastic" and "PVC" ends up as "plastic", and an entry
# containing both a plastic keyword and "coating" ends up as the plastic label.
# NOTE(review): confirm that this precedence is intended.
for keyword, label in (
    ("plastic", "plastic"),
    ("Plastic", "plastic"),
    ("PVC", "PVC plastic"),
    ("coating", "coating"),
):
    index_list = get_indexes(e, keyword)
    replace(index_list, e, label)

# Column "T": the description with the material keywords removed. There is one
# entry per row of `data`, so the column length matches the frame; the
# replacements are no-ops on rows that contain none of the keywords.
result["T"] = [
    aa.replace(' PVC', '').replace(' plastic', '').replace(' coating', '').replace(' Plastic', '')
    for aa in data
]
# Column "S": the canonical material label ("" when nothing matched).
result["S"] = e

# Write the workbook to Drive, named after the upload's name up to its first dot.
output_stem = filename.split(".")[0]
result.to_excel(f'/content/drive/MyDrive/uploaded/{output_stem}_use.xlsx')