# coding=utf-8
# On a fresh Colab runtime, install Selenium, the Chromium driver and fake_useragent first.
try:
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
except ImportError:
    !pip install selenium
    !apt-get update
    !apt install chromium-chromedriver
    !pip install fake_useragent
import time
import pickle
import pandas as pd
import csv
import os
from google.colab import drive
from google.colab import files
from google.colab import auth
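# Authenticate the Colab user so Google Drive can be accessed below.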
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
import requests
import lxml
from bs4 import BeautifulSoup
import re
import math
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
ua = UserAgent()
usa = ua.random
print(usa)
drive.mount('/content/drive', force_remount=False)
path = "/content/drive/My Drive/"
#PROXY = "120.72.54.5:8080"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument(f'user-agent={usa}')
#chrome_options.add_argument('--proxy-server=http://%s' % PROXY)
# Selenium 4 dropped the positional driver path and the chrome_options= keyword;
# pass the options object via options= and let chromedriver be found on the PATH.
wd = webdriver.Chrome(options=chrome_options)
wd.implicitly_wait(10)
wd.get("https://www.28hse.com/rent")
title = wd.title
L_price=[]
L_detail=[]
L_area=[]
headindex=[]
# Toggle two search filters on the form, then enter a 0–5000 range in the pop-up
# dialog and confirm. The XPaths are position-based and tied to the current
# 28hse.com page layout, so they may break if the page changes.
wd.find_element(By.XPATH, '//*[@id="searchBoxForm"]/div[1]/div[2]/div[31]/div[2]/div/a[2]').click()
time.sleep(1)
wd.find_element(By.XPATH, '//*[@id="searchBoxForm"]/div[1]/div[2]/div[39]/div[2]/div/a[2]').click()
time.sleep(2)
wd.find_element(By.XPATH, '/html/body/div[8]/div/div[2]/div/div/div[1]/input').send_keys("0")
time.sleep(1)
wd.find_element(By.XPATH, '/html/body/div[8]/div/div[2]/div/div/div[2]/input').send_keys("5000")
time.sleep(1)
wd.find_element(By.XPATH, '/html/body/div[8]/div/div[3]/div[2]').click()
time.sleep(1)
# Read the total number of result pages from the last entry of the pagination bar.
all_pages = wd.find_element(By.XPATH, '//*[@id="block_search_results"]/div[4]/div')
all_p = int(all_pages.text.split("\n")[-1])
time.sleep(2)
print(title)
print(f"{all_p} pages in total")
for page in range(1, all_p + 1):
# for page in range(1, 4):  # debug: scrape only the first few pages
    print(f"page {page}")
    # The "next page" arrow sits at a different <a> position depending on how many
    # pager links are rendered, so try the known positions in turn.
    for pos in (11, 12, 10, 9):
        try:
            wd.find_element(By.XPATH, f'//*[@id="block_search_results"]/div[4]/div/a[{pos}]/i').click()
            break
        except NoSuchElementException:
            continue
    time.sleep(1)
    sc = wd.page_source
    time.sleep(1)
    soup = BeautifulSoup(sc, "lxml")
    title = soup.find_all("div", {"class": "header wHoverBlue"})
    price = soup.find_all("div", {"class": "ui right floated green large label"})
    area = soup.find_all("div", {"class": "areaUnitPrice"})
    head = soup.find_all("div", {"class": "item property_item "})
    for p in price:
        L_price.append(p.getText().strip())
    for t in title:
        L_detail.append(t.getText().strip())
    for a in area:
        L_area.append(a.getText().strip())
    # Listings already let are marked 已租 and carry no price label; remember their
    # text so a placeholder price can be inserted at the right positions later.
    for h in head:
        if "已租" in h.getText().strip():
            headindex.append(h.getText().strip())
        else:
            headindex.append("")
# Positions of the listings marked 已租 (already let); those cards have no price
# label, so insert a placeholder into L_price to keep the three lists aligned.
ii = [pos for pos, mark in enumerate(headindex) if mark != ""]
print(ii)
for pos in ii:
    L_price.insert(pos, '已租')
print(L_price)
print(L_detail)
print(L_area)
print(len(L_price))
print(len(L_detail))
print(len(L_area))
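# Assemble the results (價格 = price, 詳細 = details, 面積 = area) and save them as CSV to Google Drive.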
res = pd.DataFrame({'價格': L_price, '詳細': L_detail, '面積': L_area}, columns=['價格', '詳細', '面積'])
res.to_csv(path + "rent.csv", encoding='utf_8_sig', index=False)