28Hse EXAMPLE - a Python Selenium + BeautifulSoup combined crawler
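This script runs in Google Colab: it opens the 28hse.com rental listings in headless Chrome, filters for monthly rents between $0 and $5,000, pages through every result page, scrapes each listing's price, title and area with BeautifulSoup, and writes the result to a CSV on Google Drive.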

# coding=utf-8
# First run on Colab: install Selenium, the Chromium driver and fake_useragent,
# then re-run the cell so the imports succeed.
try:
  from selenium import webdriver
except ImportError:
  !pip install selenium fake_useragent
  !apt-get update
  !apt-get install chromium-chromedriver


import time
import pandas as pd
from google.colab import drive
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from fake_useragent import UserAgent

ua = UserAgent()
user_agent = ua.random  # random desktop user-agent string, to look less like a bot
print(user_agent)

drive.mount('/content/drive', force_remount=False)
path = "/content/drive/My Drive/"

#PROXY = "120.72.54.5:8080"

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# Hide the usual "controlled by automated software" fingerprints.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument(f'user-agent={user_agent}')
#chrome_options.add_argument('--proxy-server=http://%s' % PROXY)
wd = webdriver.Chrome(options=chrome_options)  # chromedriver must be on PATH

wd.implicitly_wait(10)
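# implicitly_wait makes every find_element call poll the DOM for up to
# 10 seconds before raising, so the fixed sleeps below are only a safety margin.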
wd.get("https://www.28hse.com/rent")
title = wd.title

L_price = []    # price labels
L_detail = []   # listing titles
L_area = []     # area / unit-price text
headindex = []  # one slot per listing: its text if marked 已租 (rented), else ""

wd.find_element_by_xpath('//*[@id="searchBoxForm"]/div[1]/div[2]/div[31]/div[2]/div/a[2]').click()
time.sleep(1)
wd.find_element_by_xpath('//*[@id="searchBoxForm"]/div[1]/div[2]/div[39]/div[2]/div/a[2]').click()
time.sleep(2)
wd.find_element_by_xpath('/html/body/div[8]/div/div[2]/div/div/div[1]/input').send_keys("0")
time.sleep(1)

wd.find_element_by_xpath('/html/body/div[8]/div/div[2]/div/div/div[2]/input').send_keys("5000")
time.sleep(1)
wd.find_element_by_xpath('/html/body/div[8]/div/div[3]/div[2]').click()

time.sleep(1)

# The last line of the pagination bar's text is the total page count.
all_pages = wd.find_element(By.XPATH, '//*[@id="block_search_results"]/div[4]/div')
all_p = int(all_pages.text.split("\n")[-1])
time.sleep(2)
print(title)
print("Total pages: " + str(all_p))


for page in range(1, all_p + 1):
#for page in range(1, 4):
  print("Page " + str(page))
  sc = wd.page_source
  time.sleep(1)
  soup = BeautifulSoup(sc, "lxml")
  titles = soup.find_all("div", {"class": "header wHoverBlue"})
  prices = soup.find_all("div", {"class": "ui right floated green large label"})
  areas = soup.find_all("div", {"class": "areaUnitPrice"})
  heads = soup.find_all("div", {"class": "item property_item "})
  for p in prices:
    L_price.append(p.getText().strip())
  for t in titles:
    L_detail.append(t.getText().strip())
  for a in areas:
    L_area.append(a.getText().strip())
  # Listings marked 已租 (rented) carry no green price label, so remember
  # where they sit in the result order.
  for h in heads:
    if "已租" in h.getText().strip():
      headindex.append(h.getText().strip())
    else:
      headindex.append("")
  # The "next page" arrow lands on a different <a> index depending on how many
  # page links are shown, so try the known positions in turn. Scrape first,
  # then advance, so page 1 is not skipped.
  if page < all_p:
    for pos in (11, 12, 10, 9):
      try:
        wd.find_element(By.XPATH, f'//*[@id="block_search_results"]/div[4]/div/a[{pos}]/i').click()
        break
      except NoSuchElementException:
        continue
    time.sleep(1)

#print(headindex)
# Positions of rented listings in the full result order. (enumerate, rather
# than list.index, avoids mismatches when two rented listings share the same text.)
ii = [pos for pos, h in enumerate(headindex) if h != ""]
print(ii)
# Re-insert a 已租 placeholder at each rented position, in ascending order,
# so L_price lines up with L_detail and L_area again.
for i in ii:
  L_price.insert(i, '已租')
print(L_price)
print(L_detail)
print(L_area)
print(len(L_price))
print(len(L_detail))
print(len(L_area))

# Columns: 價格 = price, 詳細 = details, 面積 = area.
res = pd.DataFrame({'價格': L_price, '詳細': L_detail, '面積': L_area}, columns=['價格', '詳細', '面積'])

res.to_csv(path + "rent.csv", encoding='utf_8_sig', index=False)
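
The fixed time.sleep calls work, but they waste time and can still lose a race on a slow connection. Below is a minimal sketch of the same waits done with Selenium's WebDriverWait and expected_conditions; the XPaths are the ones used above, and the 10-second timeout is an assumed value you should tune to your connection:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(wd, 10)  # assumed timeout: poll up to 10 s
# Block until the pagination bar exists, instead of sleeping a fixed 2 s.
pagination = wait.until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="block_search_results"]/div[4]/div')))
# Block until the next-page arrow is clickable before clicking it.
wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//*[@id="block_search_results"]/div[4]/div/a[11]/i'))).click()

As a quick sanity check, the CSV can be read back from Drive:

check = pd.read_csv(path + "rent.csv", encoding='utf_8_sig')
print(check.head())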

