今天想爬点 nytimes 的新闻来看,爬这个地址的时候,每条消息的 date 爬取不到。 https://www.nytimes.com/section/politics
我加了 scrapy-playwright 还是一样的。求爬虫大声指点一下。
这是爬虫代码
import scrapy
from my_spider.items import MySpiderItem
class Mypider(scrapy.Spider):
name = "myspider"
allowed_domains = ["nytimes.com"]
start_urls = ["https://www.nytimes.com/section/politics"]
def start_requests(self):
for url in self.start_urls:
# 使用 PlaywrightRequest 来加载动态内容
# GET request
yield scrapy.Request(url, meta={"playwright": True})
# POST request
yield scrapy.FormRequest(
url="https://httpbin.org/post",
formdata={"foo": "bar"},
meta={"playwright": True},
)
def parse(self, response):
for article in response.css('.css-18yolpw'):
item = MySpiderItem()
item["title"] = article.css('div:nth-child(1) > article:nth-child(1) > a:nth-child(2) > h3:nth-child(1)::text').get()
t = article.xpath('div/article/a/h3/text()').get()
item["date"] = article.css('div:nth-child(1) > div:nth-child(2) > span:nth-child(1)::text').get()
d = article.xpath('div/div/span/text()').get()
item["url"] = response.urljoin(article.css('div:nth-child(1) > article:nth-child(1) > a:nth-child(2)::attr(href)').get())
item["claim"] = article.css('div:nth-child(1) > article:nth-child(1) > p:nth-child(3)::text').get()
item["rating"] = "True"
item["site"] = "NYTimes"
item["tag"] = "NYTimes"
yield item
d 的值都是"\u00a0"
1
alabrala 19 小时 40 分钟前 1
|