# coding=utf-8
try:
import xlsxwriter
except:
!pip install xlsxwriter
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
import xlsxwriter
import os
from google.colab import drive
from google.colab import files
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
import requests
import lxml
from bs4 import BeautifulSoup
import time
import sqlite3
import pandas as pd
import re
import math
import shutil
import xlrd
import openpyxl
from pathlib import Path
import random
from functools import reduce
from operator import concat
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
# Mount Google Drive at /content/drive inside the Colab runtime.
drive.mount('/content/drive', force_remount=False)
# Base folder (inside the mounted Drive) that the spreadsheets below are
# written to and read back from; file names are appended directly to it.
path = "/content/drive/My Drive/"
#==============================================================================get Proxy Data 1=================================================================================================
# Download https://free-proxy-list.net/ with requests, pull the table rows
# out of the markup with regular expressions, and save them to
# "<path>free_proxy_list.xlsx" (first row = column titles, first cell of every
# row = "<col0>:<col1>").
url = "https://free-proxy-list.net/"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
# A timeout keeps the notebook from hanging on an unresponsive host, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
response.encoding = 'utf8'
soup = BeautifulSoup(response.text, "lxml")
content = soup.prettify()
# Collapse the prettified markup onto a single line so the non-greedy
# patterns below can span a complete <tr>...</tr>.
text = "".join(line.strip() for line in content.split('\n'))
datas = re.findall(r"<tr>.*?</tr>", text)
if not datas:
    # Fail with a clear message instead of an IndexError on datas[0].
    raise ValueError("no <tr> rows found at {}".format(url))

# The first <tr> is taken as the header row; strip the <th ...> wrappers.
column_data = re.findall(r"<th.*?</th>", datas[0])
revised_column_data = [
    re.sub(r"<th>|<th|</th>|class=.*?>", "", cell).strip()
    for cell in column_data
]
allData = [revised_column_data]

# One list of cell texts per <tr>; rows without <td> cells become [].
Information = [
    [
        re.sub(r"<td>|<td|</td>|class=.*?>", "", cell).strip()
        for cell in re.findall(r"<td.*?</td>", data)
    ]
    for data in datas
]
for k in Information:
    if not k:
        continue
    # Stop at the first row whose first cell is not made up of dotted-quad
    # addresses (presumably the end of the proxy table -- confirm).
    if re.sub(r"\d+\.\d+\.\d+\.\d+", "", k[0]) != '':
        break
    allData.append(k)

# Merge the first two cells into "first:second" (the header row is merged the
# same way). Rows with fewer than two cells cannot be merged and are skipped
# rather than raising IndexError.
Proxy = [[row[0] + ":" + row[1]] + row[2:] for row in allData if len(row) >= 2]

file_name = "{}free_proxy_list.xlsx".format(path)
# The context manager closes (and therefore saves) the workbook even if a
# write raises part-way through.
with xlsxwriter.Workbook(file_name) as workbook:
    worksheet = workbook.add_worksheet('sheet1')
    for i, row in enumerate(Proxy):
        worksheet.write_row(i, 0, row)
#=================================================================================get Proxy Data 2============================================================================================
# Same table as above, but read through headless Chrome so the text is taken
# from the rendered DOM; saved to "<path>free_proxy_list2.xlsx".
path = "/content/drive/My Drive/"
url = "https://free-proxy-list.net/"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# `options=` replaces the deprecated `chrome_options=` keyword and positional
# driver path, which newer Selenium releases no longer accept. The
# chromedriver binary is expected to be found on PATH.
wd = webdriver.Chrome(options=chrome_options)
wd.implicitly_wait(10)
# Defaults so the code after the try block still has something to work with
# when one of the handled exceptions is caught (previously a NameError).
header = []
textContent = ""
try:
    wd.get(url)
    # Header cells are split on tabs; the first two titles are merged into
    # "<first>:<second>" to match the merged first column of the data rows.
    header = wd.execute_script("""List = document.querySelector("table > thead > tr").innerText.split("\t");
List[0] = List.shift()+":"+List[0];
return List""")
    textContent = wd.execute_script('return document.querySelector("table > tbody").innerText')
except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    # Always release the browser process, whatever happened above.
    wd.quit()

# innerText gives one line per table row with tab-separated cells.
allData = [line.split("\t") for line in textContent.split("\n")]
# Merge the first two cells into "first:second"; lines with fewer than two
# cells (e.g. the single empty line when nothing was scraped) are skipped.
Proxy = [[row[0] + ":" + row[1]] + row[2:] for row in allData if len(row) >= 2]
file_name = "{}free_proxy_list2.xlsx".format(path)
if Proxy:
    # The context manager closes (and therefore saves) the workbook even if
    # a write raises part-way through.
    with xlsxwriter.Workbook(file_name) as workbook:
        worksheet = workbook.add_worksheet('sheet1')
        worksheet.write_row(0, 0, header)
        for i, row in enumerate(Proxy):
            worksheet.write_row(i + 1, 0, row)
else:
    # Do not create or overwrite the spreadsheet with an empty table.
    print("No proxy rows were scraped; {} was not written".format(file_name))
#=================================================================================Read Data============================================================================================
# Load the spreadsheet produced by the first scraper. header=None keeps the
# title row as ordinary data, so the columns are labelled 0, 1, 2, ...
df = pd.read_excel(
    "{}free_proxy_list.xlsx".format(path),
    index_col=None,
    header=None,
)
# Will hold the sheet column by column (one inner list per column).
allData_Column = list()
def ToEmptyValues(x):
    """Return "" when ``str(x)`` is exactly "nan"; otherwise return ``x`` unchanged.

    Used to blank out the NaN placeholders found in the loaded sheet's cells.
    """
    return "" if str(x) == "nan" else x
# Copy the DataFrame column by column into plain lists, passing every cell
# through ToEmptyValues so NaN placeholders become empty strings.
allData_Column = [
    [ToEmptyValues(cell) for cell in df[label]]
    for label in df
]
# The first entry of the first column is the title cell; drop it so only the
# data values remain, then show them.
del allData_Column[0][0]
print(allData_Column[0])