admin管理员组文章数量:1122852
I am trying to scrape Yield tables for several countries and several maturities from a website. So far I only get empty tables:
while it should rather look like:
So far I have been doing the following:
import time
import datetime as dt
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
import requests
import re
import os
path = os.getcwd()
def ZCCWord(Date, country):
    """Scrape the government-bond yield table for *country* and return a
    DataFrame with columns TENOR, PERIOD, DATES and RATES.

    Date    -- reference date used to turn each tenor into a maturity date.
    country -- country slug appended to the site URL (e.g. 'spain').
    """
    # Site URL.  BUG FIX: the original used url = "/" + country, which has no
    # host at all, so requests could never fetch the page (hence the empty
    # tables).  The intended base URL is shown later in the post.
    url = "http://www.worldgovernmentbonds.com/country/" + country
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    # NOTE(review): this site builds its tables with JavaScript, so the first
    # <table> found in the raw HTML may still be empty — confirm against the
    # wp-json approach suggested in the answer below.
    tables = soup.find_all("table")
    table1 = tables[0]
    body_rows = table1.find_all("tr")[1:]  # skip the header row
    all_rows = []  # list of lists, one inner list per table row
    for body_row in body_rows:
        # Strip non-breaking spaces, newlines and thousands separators.
        row = [re.sub("(\xa0)|(\n)|,", "", cell.text)
               for cell in body_row.find_all("td")]
        all_rows.append(row)
    AAA = pd.DataFrame(all_rows)
    # Split e.g. "3 months" into the numeric tenor and its period word.
    ZCC = AAA[1].str.extract('([^a-zA-Z]+)([a-zA-Z]+)', expand=True).dropna().reset_index(drop=True)
    ZCC.columns = ['TENOR', 'PERIOD']
    ZCC['TENOR'] = ZCC['TENOR'].str.strip()  # remove leading/trailing spaces
    # BUG FIX: the original assigned .str.isdigit() (a boolean Series) back to
    # TENOR, destroying the tenor values before astype(int).  Filter on the
    # predicate instead of overwriting the column with it.
    ZCC = ZCC[ZCC['TENOR'].str.isdigit()].reset_index(drop=True)
    ZCC['TENOR'] = ZCC['TENOR'].astype(int)
    # NOTE(review): RATES is aligned to TENOR purely by position after two
    # independent dropna() calls — verify both columns drop the same rows.
    ZCC['RATES'] = AAA[2].str.extract(r'([0-9.]+)', expand=True).dropna().reset_index(drop=True).astype(float)
    ZCC['RATES'] = ZCC['RATES'] / 100
    # Convert each tenor into an absolute maturity date relative to Date.
    row2 = []
    for i in range(len(ZCC)):
        tenor = ZCC['TENOR'][i]
        if ZCC['PERIOD'][i] in ('month', 'months'):
            row2.append(Date + relativedelta(months=tenor))
        else:
            row2.append(Date + relativedelta(years=tenor))
    ZCC['DATES'] = pd.DataFrame(row2)
    ZCC = ZCC.reindex(['TENOR', 'PERIOD', 'DATES', 'RATES'], axis=1)
    return ZCC
LitsCountries = ['spain', 'portugal', 'latvia', 'ireland', 'united-kingdom',
                 'germany', 'france', 'italy', 'sweden', 'finland', 'greece',
                 'poland', 'romania', 'hungary', 'netherlands']
# Output workbook path, timestamped to the minute.
todays_date = path + '\\WorldYields' + str(dt.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.xlsx'
writer = pd.ExcelWriter(todays_date, engine='xlsxwriter',
                        engine_kwargs={'options': {'strings_to_urls': False}})
dictYield = {}
# One sheet per country.  (The paste lost its indentation; the duplicate
# `country = LitsCountries[i]` assignment has been removed.)
for i, country in enumerate(LitsCountries):
    Date = pd.to_datetime('today').date()
    ZCC = ZCCWord(Date, country)
    dictYield[i] = ZCC
    ZCC.to_excel(writer, sheet_name=country)
    time.sleep(60)  # wait one minute between requests to avoid hammering the site
# BUG FIX: close the writer exactly once, after all sheets are written —
# closing it inside the loop invalidates it for subsequent countries.
writer.close()
I would also be fine with other websites, solutions, or methods that provide similar output. Any ideas?
thanks in advance!
I am trying to scrape Yield tables for several countries and several maturities from a website. So far I only get empty tables:
while it should rather look like:
So far I have been doing the following:
import time
import datetime as dt
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
import requests
import re
import os
path = os.getcwd()
def ZCCWord(Date, country):
    """Scrape the government-bond yield table for *country* and return a
    DataFrame with columns TENOR, PERIOD, DATES and RATES.

    Date    -- reference date used to turn each tenor into a maturity date.
    country -- country slug appended to the site URL (e.g. 'spain').
    """
    # Site URL
    url = "http://www.worldgovernmentbonds.com/country/" + country
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    # NOTE(review): this site builds its tables with JavaScript, so the first
    # <table> found in the raw HTML may still be empty — confirm against the
    # wp-json approach suggested in the answer below.
    tables = soup.find_all("table")
    table1 = tables[0]
    body_rows = table1.find_all("tr")[1:]  # skip the header row
    all_rows = []  # list of lists, one inner list per table row
    for body_row in body_rows:
        # Strip non-breaking spaces, newlines and thousands separators.
        row = [re.sub("(\xa0)|(\n)|,", "", cell.text)
               for cell in body_row.find_all("td")]
        all_rows.append(row)
    AAA = pd.DataFrame(all_rows)
    # Split e.g. "3 months" into the numeric tenor and its period word.
    ZCC = AAA[1].str.extract('([^a-zA-Z]+)([a-zA-Z]+)', expand=True).dropna().reset_index(drop=True)
    ZCC.columns = ['TENOR', 'PERIOD']
    ZCC['TENOR'] = ZCC['TENOR'].str.strip()  # remove leading/trailing spaces
    # BUG FIX: the original assigned .str.isdigit() (a boolean Series) back to
    # TENOR, destroying the tenor values before astype(int).  Filter on the
    # predicate instead of overwriting the column with it.
    ZCC = ZCC[ZCC['TENOR'].str.isdigit()].reset_index(drop=True)
    ZCC['TENOR'] = ZCC['TENOR'].astype(int)
    # NOTE(review): RATES is aligned to TENOR purely by position after two
    # independent dropna() calls — verify both columns drop the same rows.
    ZCC['RATES'] = AAA[2].str.extract(r'([0-9.]+)', expand=True).dropna().reset_index(drop=True).astype(float)
    ZCC['RATES'] = ZCC['RATES'] / 100
    # Convert each tenor into an absolute maturity date relative to Date.
    row2 = []
    for i in range(len(ZCC)):
        tenor = ZCC['TENOR'][i]
        if ZCC['PERIOD'][i] in ('month', 'months'):
            row2.append(Date + relativedelta(months=tenor))
        else:
            row2.append(Date + relativedelta(years=tenor))
    ZCC['DATES'] = pd.DataFrame(row2)
    ZCC = ZCC.reindex(['TENOR', 'PERIOD', 'DATES', 'RATES'], axis=1)
    return ZCC
LitsCountries = ['spain', 'portugal', 'latvia', 'ireland', 'united-kingdom',
                 'germany', 'france', 'italy', 'sweden', 'finland', 'greece',
                 'poland', 'romania', 'hungary', 'netherlands']
# Output workbook path, timestamped to the minute.
todays_date = path + '\\WorldYields' + str(dt.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.xlsx'
writer = pd.ExcelWriter(todays_date, engine='xlsxwriter',
                        engine_kwargs={'options': {'strings_to_urls': False}})
dictYield = {}
# One sheet per country.  (The paste lost its indentation; the duplicate
# `country = LitsCountries[i]` assignment has been removed.)
for i, country in enumerate(LitsCountries):
    Date = pd.to_datetime('today').date()
    ZCC = ZCCWord(Date, country)
    dictYield[i] = ZCC
    ZCC.to_excel(writer, sheet_name=country)
    time.sleep(60)  # wait one minute between requests to avoid hammering the site
# BUG FIX: close the writer exactly once, after all sheets are written —
# closing it inside the loop invalidates it for subsequent countries.
writer.close()
I would also be fine with other websites, solutions, or methods that provide similar output. Any ideas?
thanks in advance!
Share Improve this question edited 2 days ago Barmar 779k56 gold badges542 silver badges656 bronze badges asked 2 days ago Luca91Luca91 6091 gold badge8 silver badges21 bronze badges 4- I notice when going to that URL the data is not available immediately. I suppose it might be possible the data is not present when you start parsing it. You might try a print(body_rows) to show you. – jsf80238 Commented 2 days ago
- 3 That website is heavily reliant on JavaScript. You'll need to use selenium or something similar. requests with BeautifulSoup will not suffice – SIGHUP Commented 2 days ago
- 2 you CAN get the data with requests/bs4 from the wp-json endpoint. – GTK Commented 2 days ago
- @GTK many thanks for your help! Sorry but I am not familiar with scraping and request / beautifulsoup...could you please be more specific? – Luca91 Commented 2 days ago
1 Answer
To get the table data you need to use the wp-json endpoint combined with the country id. You can retrieve the country id from the page you are already requesting by finding it in the raw text response.
Next you need to request the wp-json endpoint. There you will receive a JSON object that includes the table HTML.
def request_table(country_id: str):
    """POST to the site's wp-json endpoint and return the yield table HTML.

    country_id -- the SYMBOL id extracted from the country page.
    Returns the value of the "mainTable" key in the JSON response, or None
    when that key is absent.
    """
    endpoint = "https://www.worldgovernmentbonds.com/wp-json/country/v1/main"
    # The endpoint expects the same GLOBALVAR structure the page's own
    # JavaScript would send.
    body = {
        "GLOBALVAR": {
            "JS_VARIABLE": "jsGlobalVars",
            "FUNCTION": "Country",
            "DOMESTIC": True,
            "ENDPOINT": "https://www.worldgovernmentbonds.com/wp-json/country/v1/historical",
            "DATE_RIF": "2099-12-31",
            "OBJ": None,
            "COUNTRY1": {"SYMBOL": country_id},
            "COUNTRY2": None,
            "OBJ1": None,
            "OBJ2": None,
        }
    }
    request_headers = {
        'accept': '*/*',
        'content-type': 'application/json; charset=UTF-8',
        'origin': 'https://www.worldgovernmentbonds.com',
    }
    reply = requests.request("POST", endpoint, headers=request_headers,
                             data=json.dumps(body))
    return reply.json().get("mainTable")
def ZCCWord(Date, country):
    """Fetch the country page, extract its SYMBOL id, and parse the yield
    table returned by the wp-json endpoint with BeautifulSoup.
    """
    # Site URL
    url = "http://www.worldgovernmentbonds.com/country/" + country
    html_content = requests.get(url).text
    # Extract the country id embedded in the page's JS globals.
    # FIX: derive the offset from the marker itself instead of hard-coding 10,
    # and fail loudly when the marker is missing (the original would slice
    # from index -1 + 10 and silently return garbage).
    marker = "\"SYMBOL\":\""
    marker_pos = html_content.find(marker)
    if marker_pos == -1:
        raise ValueError(f"SYMBOL id not found in page for country {country!r}")
    start_index = marker_pos + len(marker)
    end_index = html_content.find("\",", start_index)
    country_id = html_content[start_index:end_index]
    # Request the table HTML through the JSON API.
    table_html = request_table(country_id)
    soup = BeautifulSoup(table_html, "lxml")
    gdp = soup.find_all("table")
    table1 = gdp[0]
    # ... Rest of your code ...
本文标签: pandasPython BeautifulSoup scraping yield dataStack Overflow
版权声明:本文标题:pandas - Python: BeautifulSoup scraping yield data - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1736514166a1944345.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论