zennn077/India_budget
收藏Hugging Face2024-02-08 更新2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/zennn077/India_budget
下载链接
链接失效反馈官方服务:
资源简介:
!pip install requests-html
import requests
from bs4 import BeautifulSoup
import csv
# Function to scrape data from the website
def scrape_website(url, timeout=30):
    """Download a press-release page and extract its title and body text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, content)`` tuple of whitespace-stripped strings, or
        ``(None, None)`` when the request fails, the response status is not
        200, or the expected elements are absent from the page.
    """
    # requests.get waits forever by default; bound it so a stalled server
    # cannot hang the script.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        print("Failed to retrieve data from the website.")
        return None, None

    if response.status_code != 200:
        print("Failed to retrieve data from the website.")
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # find() returns None when nothing matches, so every lookup is checked
    # before it is dereferenced.
    container = soup.find('div', {'id': 'divPressRelease'})
    heading = container.find('h1') if container is not None else None
    detail = (
        container.find('div', {'class': 'pressreldetail'})
        if container is not None
        else None
    )
    if heading is None or detail is None:
        print("Press release content was not found on the page.")
        return None, None

    return heading.text.strip(), detail.text.strip()
# Main function
def main():
    """Scrape one fixed press-release page and save it to a CSV file.

    Calls ``scrape_website`` on a hard-coded URL. When both a title and a
    body come back non-empty, writes a header row and one data row to
    ``scraped_data.csv`` and reports success; otherwise reports that
    nothing was scraped and writes no file.
    """
    target_url = 'https://www.pib.gov.in/PressReleasePage.aspx?PRID=1895315'
    title, content = scrape_website(target_url)

    # Bail out early when either piece is missing or empty.
    if not (title and content):
        print("No data was scraped.")
        return

    rows = [
        ['Title', 'Content'],
        [title, content],
    ]
    # newline='' lets the csv module control line endings itself.
    with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(rows)
    print("Scraped data has been saved to 'scraped_data.csv'.")
# 执行以下命令安装requests-html库:`!pip install requests-html`
import requests
from bs4 import BeautifulSoup
import csv
# Function that scrapes data from the website
def scrape_website(url, timeout=30):
    """Download a press-release page and extract its title and body text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, content)`` tuple of whitespace-stripped strings, or
        ``(None, None)`` when the request fails, the response status is not
        200, or the expected elements are absent from the page.
    """
    # requests.get waits forever by default; bound it so a stalled server
    # cannot hang the script.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        print("未能从目标网站获取数据。")
        return None, None

    if response.status_code != 200:
        print("未能从目标网站获取数据。")
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # find() returns None when nothing matches, so every lookup is checked
    # before it is dereferenced.
    container = soup.find('div', {'id': 'divPressRelease'})
    heading = container.find('h1') if container is not None else None
    detail = (
        container.find('div', {'class': 'pressreldetail'})
        if container is not None
        else None
    )
    if heading is None or detail is None:
        print("未在页面中找到新闻稿内容。")
        return None, None

    return heading.text.strip(), detail.text.strip()
# Main function
def main():
    """Scrape one fixed press-release page and save it to a CSV file.

    Calls ``scrape_website`` on a hard-coded URL. When both a title and a
    body come back non-empty, writes a header row and one data row to
    ``scraped_data.csv`` and reports success; otherwise reports that
    nothing was scraped and writes no file.
    """
    target_url = 'https://www.pib.gov.in/PressReleasePage.aspx?PRID=1895315'
    title, content = scrape_website(target_url)

    # Bail out early when either piece is missing or empty.
    if not (title and content):
        print("未抓取到任何有效数据。")
        return

    rows = [
        ['Title', 'Content'],
        [title, content],
    ]
    # newline='' lets the csv module control line endings itself.
    with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(rows)
    print("抓取的数据已成功保存至'scraped_data.csv'。")
提供机构:
zennn077
原始信息汇总
数据集概述
数据来源
- 数据来源于网站:
https://www.pib.gov.in/PressReleasePage.aspx?PRID=1895315
数据内容
- 数据包括新闻稿的标题和内容。
- 标题取自 <div id="divPressRelease"> 内的 <h1> 标签,正文取自同一容器内的 <div class="pressreldetail"> 标签。
数据存储
- 提取的数据存储在CSV文件 scraped_data.csv 中。
- CSV文件包含两列:Title 和 Content。



