2020-10-26

Google Sheets crawler

This post walks through a small crawler that scrapes stock quotes from Yahoo Finance Taiwan (tw.stock.yahoo.com) and writes them into a Google Sheet via gspread.

# -*- coding: UTF-8 -*-
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import requests
from bs4 import BeautifulSoup   # lxml only needs to be installed; it serves as the parser backend

# Authorize with the service-account key and open the first worksheet of the target sheet
scopes = ["https://spreadsheets.google.com/feeds"]
credentials = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scopes)
client = gspread.authorize(credentials)
sheet = client.open_by_key("1q9EJlySLhEqMuJm7bX3NGgys-v_3xqB-nFYwgrNJGM8").sheet1


def getlink():
    """Build a quote-page URL for each stock code the user types in."""
    page_links_list = []
    pages = input("Enter stock codes separated by spaces, not commas: ").split()
    # pages = ["2415", "2514"]
    for page in pages:
        url = "https://tw.stock.yahoo.com/q/q?s=" + str(page)
        page_links_list.append(url)
    return page_links_list


# print(getlink())

for i in getlink():
    print(i)


response = requests.get("https://tw.stock.yahoo.com/q/q?s=2451")
soup = BeautifulSoup(response.text, "lxml")
table = soup.find_all("table")[1]      # the quote table is the second <table> on the page
ths = table.find_all("th")[0:11]       # header cells
tds = table.find_all("td")[1:12]       # the matching data cells
titles = ("Date of data",) + tuple(th.getText() for th in ths)
data = ("Date",) + tuple(td.getText() for td in tds)

sheet.clear()
sheet.insert_row(list(titles), 1)  # write the headers to row 1
sheet.insert_row(list(data), 2)    # write the quote to row 2
print(sheet.get_all_values())
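The scrape above assumes the request succeeds and that the quote table is still the second <table> on the page. A minimal defensive variant of the fetch-and-parse step (the guards are my own addition, not part of the original script):

response = requests.get("https://tw.stock.yahoo.com/q/q?s=2451", timeout=10)
response.raise_for_status()  # stop on HTTP 4xx/5xx instead of parsing an error page

soup = BeautifulSoup(response.text, "lxml")
all_tables = soup.find_all("table")
if len(all_tables) < 2:
    raise RuntimeError("quote table not found; Yahoo may have changed the page layout")
table = all_tables[1]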

credentials.json in your project's root directory:


{
  "type": "service_account",
  "project_id": "second-abacus-293014",
  "private_key_id": "848dd0b6ad098058ddb202e3d2ee60b81835b18f",
  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC6FfAKjS1Gw3jM\nqeyZDqoKsqafCM6GbZx30GbZjmsbAr4ws1JOnBo8xDhlC4ZgMr3LMxN1O691JiWA\nmbdBeRphrEI5sgDvfIdo+Rq/N92BEVDOCvh069B9MTCXK7QWR/3qczpJfI/pxhLn\nCnjCBtMLL9+uNkTa8E0wj+pcTa4LuGsCRkmmXevNkESZF7Hvxqq+QUBcH6e5nuNK\ndWyn/RXRPc+QdxLTfn9UP4vkN7iN8ojcot91hqRWn3sZlJx03h3wYxrBl2QYLUBK\nwc8h8+aoritvzRhXErZ3VcYEEzmlsQrg6NQvWiouvhAmW9ZGq0bs5HsBHWAxc9gK\nrc4gCtWdAgMBAAECggEABDGcYFCANbkQDuTBfU2UTXtrP+hN5z+vKBAdUUfhaBFu\nweA4uabQ/I+bicNxutdJ6btz889ti+FZNQXB2IrnKk5ci7pURRF0Z8w9axPSPSdr\nv8kFgnUSFNA3/SAyBdKHiXIhXSiegbFCGczlndHTjcv9aYq1kI2b371Oldf2VTM/\n95ym4+wRUMUqs1Xnc8z/7KNzuOlR6pcDSBVUYN/6Dt+b/sg+NV4jSq5h+05m+qPN\nMPwD5rxJhaDy3s8XXQOlNjHK9+p65bRgqxySqO63+PfTyAhXRadS1oYkgGxrHpZs\n3X/FJdeh/KEvm0h/CP7Ee4BTU8G3epOnwfsdGxYBYQKBgQDaTcvFsE8Pu7rzajA1\nAkJKZcHSnCYPQj7oQrDUfU7wFzlJBWBDvsqF03OB+qX27uHFS/JlB7ivkS73Y1H+\npR44tnItaDNYW1ta+DBwqzdIFHT4njlg8GcpxbgYSVbyATeAgoewz0Y4C/8/iR0S\n/MLdBx/erDj0SWPEWK7cNG8L+QKBgQDaN+ymLgyQs6JzlnKH5fxdSd+PqJGhfqq0\npN3+OCNTgpyNwUxeJBGg9zvkLnxjU1xc0iuO7KUuJ/q9pO83tIlCVEke+Gy+k4Vl\nJqzLOVB7Rm9+ENG2tW3vKx6gocpl+SLMw9Zd2k9Amy0CGopzTQ/ZTGjHTUImLeOl\n3ui+lO1XxQKBgAIbyEETj9/uB0M8nlSUvgEr9HBNU8zH7gWeR0j2IOIOx13eMGzX\nFeuu/CmgOyQkcIfQUPoK1gzUESkQi4UzIRSZJocrv3s1/q7lQMqFzNXkZLDWDmXS\nTJzJrRqG8tEJk/ioyhi6F8qIyyFY+K/rQxKRdAxZi39Iii8mUCIOW5g5AoGBAJF4\noJjzr8qzBcP6cmVoCwGBorhZDqRTFaeRFLBJajtcodLaHnfPobTlvBZUavWn97Wl\nIgaIeWE4o63TCunev3XpTEmKIKw3GcEhtCVSEvXrMZ4fxj3/2saKv0MkOBIcsHdC\nwmAUjaqrB6kUWmGc6MjaaCsctNFQ3CaSYCPUkhhlAoGBAIEpsT4DupHqZApKFjqb\neQ9PJeVeKXcbzrS3vZt2b0nZwrM5m2e3hQvw5EK+XwQ+629ktBkNlMd7QWG/g6VU\nR3tMb4QNlXu/xa9Rf5SGMQFGTEbj8GCsIgNiBwkZVgpzAtoZtdO1zqsgVGKhQ1Rl\nYE7WCANinL2nPD34xHoNVDgz\n-----END PRIVATE KEY-----\n",
  "client_email": "aixyn1khd-a8kxjeyhyynbmj@second-abacus-293014.iam.gserviceaccount.com",
  "client_id": "116014604720294077792",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/aixyn1khd-a8kxjeyhyynbmj%40second-abacus-293014.iam.gserviceaccount.com"
}
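Note that a service account is its own Google identity: open_by_key only works if the spreadsheet has been shared with the client_email address above (or was created by the service account itself). A quick access check, assuming the same client object as the script; depending on the gspread version, a missing grant surfaces as APIError or SpreadsheetNotFound:

from gspread.exceptions import APIError, SpreadsheetNotFound

try:
    sheet = client.open_by_key("1q9EJlySLhEqMuJm7bX3NGgys-v_3xqB-nFYwgrNJGM8").sheet1
except (APIError, SpreadsheetNotFound):
    print("No access: share the sheet with the client_email from credentials.json")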





Google Colab notebook:

https://colab.research.google.com/drive/1A4-QMnk0I86qbTs0QJJvpH2n4aMdMLyB?usp=sharing



Sample:


# -*- coding: UTF-8 -*-
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import requests
from bs4 import BeautifulSoup   # lxml only needs to be installed; it serves as the parser backend

scopes = ["https://spreadsheets.google.com/feeds"]
credentials = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scopes)
client = gspread.authorize(credentials)
sheet = client.open_by_key("1q9EJlySLhEqMuJm7bX3NGgys-v_3xqB-nFYwgrNJGM8").sheet1

# Write the header row once, scraped from a reference quote page
sheet.clear()
response = requests.get("https://tw.stock.yahoo.com/q/q?s=2415")
soup = BeautifulSoup(response.text, "lxml")
table = soup.find_all("table")[1]
ths = table.find_all("th")[0:11]
titles = ("資料日期",) + tuple(th.getText() for th in ths)  # "資料日期" means "data date"
sheet.insert_row(list(titles), 1)


def getlink():
    """Build a quote-page URL for each stock code the user types in."""
    page_links_list = []
    pages = input("Enter stock codes separated by spaces, not commas: ").split()
    for page in pages:
        url = "https://tw.stock.yahoo.com/q/q?s=" + str(page)
        page_links_list.append(url)
    return page_links_list


# Scrape each requested stock and insert its data under the header row
for i in getlink():
    response = requests.get(i)
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find_all("table")[1]
    tds = table.find_all("td")[1:12]
    data = ("--",) + tuple(td.getText() for td in tds)
    sheet.insert_row(list(data), 2)
    print(sheet.get_all_values())
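Because each result is inserted at row 2, earlier stocks get pushed down and the sheet ends up in reverse input order. If the original order matters, gspread's append_row writes below the last filled row instead; a sketch of the same loop:

for i in getlink():
    response = requests.get(i)
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find_all("table")[1]
    tds = table.find_all("td")[1:12]
    data = ["--"] + [td.getText() for td in tds]
    sheet.append_row(data)  # appends after the last non-empty row, preserving input order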


Colab example:


from google.colab import auth
auth.authenticate_user()

import requests
from bs4 import BeautifulSoup   # lxml only needs to be installed; it serves as the parser backend
import gspread
from oauth2client.client import GoogleCredentials

# In Colab you authenticate as yourself, so no credentials.json is needed
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Create a spreadsheet in your Drive, then open its first worksheet and add some data
sh = gc.create('A new spreadsheet')
worksheet = sh.sheet1

# Scrape the header cells and write them as row 1
url = "https://tw.stock.yahoo.com/q/q?s=2415"
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find_all("table")[1]
ths = table.find_all("th")[0:11]
titles = [th.getText() for th in ths]

worksheet.insert_row(titles, 1)
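To check the result, the new spreadsheet's URL can be built from its ID, and the data row can be filled in with the same td-scraping step as the earlier scripts (a sketch continuing from the code above):

# Link to the freshly created spreadsheet (it lives in your own Drive in Colab)
print("https://docs.google.com/spreadsheets/d/" + sh.id)

# Fill in the data cells under the headers, as in the earlier scripts
tds = table.find_all("td")[1:12]
data = ["--"] + [td.getText() for td in tds]
worksheet.insert_row(data, 2)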

