# Colab setup: ensure fake_useragent is available, mount Google Drive,
# and load the list of candidate proxies to test.
try:
    from fake_useragent import UserAgent
except:
    # NOTE(review): bare except + notebook "!" magic — this line only runs
    # inside IPython/Colab; the package is re-imported below after install.
    !pip install fake_useragent
import threading
import requests
import queue
from fake_useragent import UserAgent
import os
from google.colab import drive
from google.colab import files
from google.colab import auth
auth.authenticate_user()  # interactive Google sign-in (Colab only)
from oauth2client.client import GoogleCredentials
import lxml
from bs4 import BeautifulSoup
import time
import pandas as pd
from multiprocessing.dummy import Pool
import time  # NOTE(review): duplicate import (time already imported above)
import pickle
import pandas as pd  # NOTE(review): duplicate import (pandas already imported above)
import csv
drive.mount('/content/drive', force_remount=False)
# NOTE(review): `global` at module level is a no-op — `path` is already global here.
global path
path = "/content/drive/MyDrive/"
# NOTE(review): error_bad_lines is deprecated in pandas >= 1.3 (use on_bad_lines="skip").
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/bulk_proxy/China_Proxy.csv', error_bad_lines=False)
test_list=df['IP&PORT_China'].tolist()  # candidate "ip:port" strings to test
ip_pool = []  # shared result list — worker threads append working proxies here
# Random request header so the probe does not always send the same UA.
ua = UserAgent()
headers = {'User-Agent':ua.random}  # one random UA picked once, reused for all requests
url = 'http://icanhazip.com/'  # echoes the caller's public IP; used to probe each proxy
def test_ip(queue_list):
    """Worker: drain proxy addresses from *queue_list* and keep the live ones.

    Each address is used as an HTTP proxy for a GET against the module-level
    ``url``; on a 200 response the address is appended to the shared
    ``ip_pool`` list (list.append is atomic under the GIL, so no extra lock
    is needed). Returns when the queue is empty.
    """
    while True:
        # get_nowait() + queue.Empty closes the empty()/get() race of the
        # original: with several workers, another thread could take the last
        # item between the empty() check and the get(), leaving this worker
        # blocked forever.
        try:
            ip = queue_list.get_nowait()
        except queue.Empty:
            break
        proxies = {
            'http' : ip
        }
        try:
            response = requests.get(url=url, headers=headers, proxies=proxies,timeout=3)
            if response.status_code == 200:
                print("[%s]test%s,Test results [available]" % (threading.current_thread().name, proxies))
                ip_pool.append(ip)
        except requests.RequestException:
            # Narrowed from a bare except: only swallow network-level errors,
            # never KeyboardInterrupt/SystemExit.
            print("[%s]test%s,Test results [not available]" % (threading.current_thread().name, proxies))
if __name__ == '__main__':
    # Shared work queue: every candidate proxy goes in once.
    queue_list = queue.Queue()
    for candidate in test_list:
        queue_list.put(candidate)
    # Spin up nine worker threads that drain the queue concurrently.
    workers = []
    for idx in range(9):
        worker = threading.Thread(target=test_ip, args=(queue_list,), name="process%s" % idx)
        workers.append(worker)
    for worker in workers:
        worker.start()
    # Wait until every worker has finished before reporting.
    for worker in workers:
        worker.join()
    print('Test completed')
    print(ip_pool)
    # Persist the proxies that passed the check back to Drive as a CSV.
    res = pd.DataFrame({'available_IP&PORT': ip_pool}, columns=['available_IP&PORT'])
    res.to_csv("/content/drive/My Drive/Colab Notebooks/AvailableIP.csv", encoding='utf_8_sig', index=False)
    print("Total crawl%s individual ip,available ip For:%s,Unavailable ip For:%s"%(len(test_list),len(ip_pool),len(test_list)-len(ip_pool)))