mongodb 在 scrapy 如何去重,然后下载管道如何管理

2018-10-09 15:04:57 +08:00
 Ewig
import pymongo
from scrapy import Request
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline


class XiaoMiQuanPipeLines(object):
    """Item pipeline that de-duplicates items on the (url, name) pair.

    The first time a (url, name) combination is seen it is written to the
    configured MongoDB collection and the item is passed on to the next
    pipeline.  Any later item with the same combination is dropped, so
    pipelines that run after this one never see it.
    """

    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]

        client = pymongo.MongoClient(host=host, port=port)
        # Collection holding one document per (url, name) pair already seen.
        self.post = client[dbname][sheetname]

    def process_item(self, item, spider=None):
        """Record a new item, or drop it when it was already recorded.

        Args:
            item: Mapping with the keys ``file_url`` and ``name``.
            spider: The spider that produced the item.  Scrapy passes it
                positionally; it is not used here and defaults to ``None``
                so direct calls with only ``item`` keep working.

        Returns:
            ``item`` unchanged when the (url, name) pair is new.

        Raises:
            DropItem: When a document with the same url and name exists.
        """
        record = {"url": item['file_url'], "name": item['name']}

        # An exact-match lookup on both fields is the compound-key test.
        # The previous aggregate() call returned a cursor object, which is
        # always truthy, so the insert branch could never run.
        if self.post.find_one(record) is not None:
            raise DropItem(
                "Duplicate item: url=%r name=%r" % (record["url"], record["name"])
            )

        self.post.insert_one(record)
        return item


class DownLoadPipelines(FilesPipeline):
    """Files pipeline that saves each download under the item's ``name``."""

    def file_path(self, request, response=None, info=None):
        """Return the ``filename`` stored in the request meta, or ``''``."""
        request_meta = request.meta
        if 'filename' in request_meta:
            return request_meta['filename']
        return ''

    def get_media_requests(self, item, info):
        """Yield one download request for the item's ``file_url``.

        The item's ``name`` travels in the request meta under ``filename``
        so that ``file_path`` can read it back.
        """
        download = Request(
            url=item['file_url'],
            meta={'filename': item['name']},
        )
        yield download


这里写两个管道,先判断,如果重复不下载,如果不重复,写入数据库,然后下载,这里用 aggregate 联合键去重
2777 次点击
所在节点    Python
9 条回复
watsy0007
2018-10-09 16:22:08 +08:00
```python

class MongoCache:
    """Change-tracking store for scraped records, backed by MongoDB.

    Every document has three fields: ``unique_key``, the raw ``data``
    mapping, and ``summaries`` - one ``generator_summary`` value per data
    field.  Comparing summaries is how changed fields are detected.
    ``generator_summary``, ``MongoClient`` and ``config`` are expected to be
    provided by the surrounding module.
    """

    # Database handle shared by all instances; set by create_instance().
    db = None

    def __init__(self):
        # Connect once only.  The earlier check looked for a 'pool'
        # attribute that is never assigned, so every instantiation opened
        # a fresh MongoClient.
        if MongoCache.db is None:
            MongoCache.create_instance()

    @staticmethod
    def create_instance():
        """Open the MongoDB connection and bind the ``spider`` database."""
        client = MongoClient(config.MONGO_URL)
        MongoCache.db = client['spider']

    def _find(self, table, unique_key):
        """Return the stored document for ``unique_key``, or ``None``."""
        return self.db[table].find_one({'unique_key': unique_key})

    @staticmethod
    def _summarize(origin_data):
        """Map every field of ``origin_data`` to its summary value."""
        return {k: generator_summary(v) for (k, v) in origin_data.items()}

    @staticmethod
    def _field_changed(last_summaries, key, value):
        """Tell whether ``value`` differs from what was stored for ``key``."""
        last_summary = last_summaries.get(key, None)
        return last_summary is None or last_summary != generator_summary(value)

    def create(self, table, unique_key, origin_data):
        """Insert a new record.

        Returns:
            The inserted document's ``_id``, or ``None`` when a record with
            ``unique_key`` already exists (nothing is written then).
        """
        if self.exists(table, unique_key):
            return None

        result = self.db[table].insert_one({
            'unique_key': unique_key,
            'data': origin_data,
            'summaries': self._summarize(origin_data),
        })
        # insert_one() returns a result object; the legacy insert() call
        # returned the id itself, so keep handing back the id.
        return result.inserted_id

    def get(self, table, unique_key):
        """Return the stored ``data`` for ``unique_key``, or ``None``."""
        document = self._find(table, unique_key)
        if document is None:
            return None
        return document['data']

    def exists(self, table, unique_key):
        """Return ``True`` when a record with ``unique_key`` is stored."""
        return self._find(table, unique_key) is not None

    def is_changed(self, table, unique_key, origin_data):
        """Return ``True`` when the record is missing or any field differs."""
        # A single lookup: querying twice could hit a document deleted in
        # between and fail on subscripting None.
        document = self._find(table, unique_key)
        if document is None:
            return True

        last_summaries = document['summaries']
        return any(
            self._field_changed(last_summaries, k, v)
            for (k, v) in origin_data.items()
        )

    def change_fields(self, table, unique_key, origin_data):
        """Return the subset of ``origin_data`` that is new or changed.

        When no record exists, ``origin_data`` itself is returned.
        """
        document = self._find(table, unique_key)
        if document is None:
            return origin_data

        last_summaries = document['summaries']
        return {
            k: v
            for (k, v) in origin_data.items()
            if self._field_changed(last_summaries, k, v)
        }

    def update(self, table, unique_key, origin_data):
        """Overwrite ``data`` and ``summaries`` of an existing record.

        A missing record is left missing; ``origin_data`` is returned in
        both cases.
        """
        if not self.exists(table, unique_key):
            return origin_data

        self.db[table].update_one(
            {'unique_key': unique_key},
            {'$set': {
                'data': origin_data,
                'summaries': self._summarize(origin_data),
            }},
        )
        return origin_data
```
watsy0007
2018-10-09 16:24:07 +08:00
v2ex 不支持 markdown...

https://gist.github.com/watsy0007/779c27fb0ceab283cc434b5eec10b7c4

封装了针对数据处理的公共方法.
picone
2018-10-09 20:47:42 +08:00
我是直接 mongo 加 unique 索引,并捕捉索引冲突异常。。
Ewig
2018-10-12 12:29:00 +08:00
@picone 你的是联合键吗?我说的是 url 和 name 一起
picone
2018-10-12 13:33:01 +08:00
Ewig
2018-10-12 17:31:35 +08:00
@picone db.XiaoMiQuan.find()
{ "_id" : ObjectId("5bbf14dbc96b5b3f5627d11d"), "file_url" : "https://baogaocos.seedsufe.com/2018/07/19/doc_1532004923556.pdf", "name" : "AMCHAM-中国的“一带一路”:对美国企业的影响(英文)-2018.6-8 页.pdf" }我现在是这样写的
这是对的?
pyfrog
2019-01-24 17:37:46 +08:00
@Ewig 用不用把他全站 pdf 发你
Ewig
2019-01-24 18:04:30 +08:00
@pyfrog 人家网站是更新的吧
pyfrog
2019-01-24 18:33:58 +08:00
@Ewig 是啊,直接给你服务器

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://tanronggui.xyz/t/495979

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX