2020-09-20

Crawling data and saving it to CSV/Excel

# -*- coding: UTF-8 -*-
import requests
import pandas as pd
from bs4 import BeautifulSoup  # parsed with the lxml backend below
import time
import random
#import csv
#import codecs
#import unicodecsv as csv

name, score, comment = [], [], []

URL = 'https://ithelp.ithome.com.tw/articles?tab=tech'

# a desktop user-agent so the site serves the regular page
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}

res = requests.get(URL, headers=headers)
soup = BeautifulSoup(res.text, 'lxml')

# article titles
data1 = soup.find_all('h3', {"class": "qa-list__title"})
for d1 in data1:
    comment.append(d1.text.strip())

# like/reply/view counters: collapse the whitespace rather than calling
# strip("\n\n0\nLike\n\n0\n"), which treats its argument as a character set,
# not a substring
data2 = soup.find_all('div', {"class": "qa-list__condition"})
for d2 in data2:
    score.append(' '.join(d2.text.split()))

# author and date line
data3 = soup.find_all('div', {"class": "qa-list__info"})
for d3 in data3:
    name.append(d3.text.strip())

# write out with pandas; utf_8_sig prepends a BOM so Excel opens the CSV
# with the Chinese column names (date / views / title) intact
df = pd.DataFrame({'日期': name, '瀏覽': score, '評價': comment}, columns=['日期', '瀏覽', '評價'])
df.to_csv("save.csv", encoding='utf_8_sig', index=False)
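The title promises Excel output as well, and time and random are imported above but never used. A minimal sketch under two assumptions not shown in the original: that the listing paginates with a ?page=N query parameter, and that openpyxl is installed for .xlsx output. It keeps appending to the same three lists and sleeps politely between requests:

# hypothetical pagination: the ?page=N parameter is an assumption
for page in range(2, 4):  # pages 2 and 3; page 1 was scraped above
    page_url = 'https://ithelp.ithome.com.tw/articles?tab=tech&page=%d' % page
    page_soup = BeautifulSoup(requests.get(page_url, headers=headers).text, 'lxml')
    for d1 in page_soup.find_all('h3', {"class": "qa-list__title"}):
        comment.append(d1.text.strip())
    for d2 in page_soup.find_all('div', {"class": "qa-list__condition"}):
        score.append(' '.join(d2.text.split()))
    for d3 in page_soup.find_all('div', {"class": "qa-list__info"}):
        name.append(d3.text.strip())
    time.sleep(random.uniform(1, 3))  # polite delay; the reason time/random are imported

df = pd.DataFrame({'日期': name, '瀏覽': score, '評價': comment})
df.to_excel("save.xlsx", index=False)  # real Excel output; needs openpyxl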



# Alternative: bail out early on a bad HTTP status
#URL = ''
#html = requests.get(URL).text
#soup = BeautifulSoup(html, 'lxml')
#if requests.get(URL).status_code != 200:
#    print("error")

# Alternative: paginated Douban movie comments via XPath; note that
# `response` is never defined here (see the runnable sketch below)
#def get(page):
#    url = 'https://movie.douban.com/subject/6390825/comments?start=%d&limit=20&sort=new_score&status=P' % (page * 20)
#    for i in range(1, 21):
#        name.append(response.xpath('//*[@id="comments"]/div[%s]/div[2]/h3/span[2]/a' % i)[0].text)
#        score.append(response.xpath('//*[@id="comments"]/div[%s]/div[2]/h3/span[2]/span[2]' % i)[0].attrib['class'][7])
#        comment.append(response.xpath('//*[@id="comments"]/div[%s]/div[2]/p' % i)[0].text)
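A runnable version of that fragment, defining response with lxml.html (the XPaths assume Douban's 2020 markup, which may have changed, and the comments page may now require login):

import lxml.html

def get(page):
    url = ('https://movie.douban.com/subject/6390825/comments'
           '?start=%d&limit=20&sort=new_score&status=P' % (page * 20))
    response = lxml.html.fromstring(requests.get(url, headers=headers).text)
    for i in range(1, 21):
        name.append(response.xpath('//*[@id="comments"]/div[%s]/div[2]/h3/span[2]/a' % i)[0].text)
        # the star rating hides in a class such as "allstar40"; index 7 is its first digit
        score.append(response.xpath('//*[@id="comments"]/div[%s]/div[2]/h3/span[2]/span[2]' % i)[0].attrib['class'][7])
        comment.append(response.xpath('//*[@id="comments"]/div[%s]/div[2]/p' % i)[0].text)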








Example: writing a DataFrame to Google Sheets from Colab


import requests
from bs4 import BeautifulSoup
import pandas as pd

import gspread
import gspread_dataframe as gd
from google.colab import auth
from oauth2client.client import GoogleCredentials

# authenticate the Colab user, then authorize gspread with those credentials
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

sh = gc.create('A new spreadsheet')

# Open our new sheet and add some data.
worksheet = gc.open('A new spreadsheet').sheet1

# scrape the second <table> of the Yahoo quote page for stock 2415
url = "https://tw.stock.yahoo.com/q/q?s=2415"
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find_all("table")[1]
a = table.find_all("th")[0:11]
# collect the first eleven header texts; the original abused a list
# comprehension for its append side effect, a plain comprehension is clearer
c = [i.get_text() for i in a]
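For a page like this, pandas can also parse every <table> directly, skipping the manual <th> walk; a quick sketch (it assumes the 2020-era Yahoo layout, which has since changed):

# requires lxml; returns one DataFrame per <table> on the page
yahoo_tables = pd.read_html(response.text)
print(yahoo_tables[1].head())  # the same table indexed above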


# build a small DataFrame by assigning a single column
# ('年份' = year; the sample values read increment, nominal growth rate,
# rank, rank change)
df = pd.DataFrame()
df['年份'] = ['增量', '名义增长率', '排名', '排名变化']

or, equivalently, construct a richer sample DataFrame in one call:


df = pd.DataFrame({"id": [1001, 1002, 1003, 1004, 1005, 1006],
                   "date": pd.date_range('20130102', periods=6),
                   "city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
                   "age": [23, 44, 54, 32, 34, 32],
                   "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
                   "price": [1200, 2133, 5433, 4432, 12356, 45678]},
                  columns=['id', 'date', 'city', 'category', 'age', 'price'])


print(df)

# push the DataFrame into the worksheet
gd.set_with_dataframe(worksheet, df)



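To confirm the upload, gspread_dataframe can read the worksheet back into pandas:

# round-trip check: pull the sheet contents back as a DataFrame
df_back = gd.get_as_dataframe(worksheet)
print(df_back.head())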
