Introduction
MongoDB限制单个document不能超过16M,对于更大的数据MongoDB提供了GridFS类文件系统的存储方式。想了解GridFS实现原理请阅读GridFS官方文档,非本文重点不做额外介绍。
对于读写数据库这种IO占比较大的工作,异步读写是一个提升效率的非常高效的途径。Motor是MongoDB官方提供的python异步读写库,因此本文旨在介绍如何基于motor实现GridFS的CRUD。
官方文档永远是你的好朋友:MongoDB手册: Motor页, Motor官方文档
Realization
Connection
代码如下:
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorGridFSBucket

# Build an async client, pick a database, then open a GridFS bucket on it.
client = AsyncIOMotorClient('mongodb_url')  # create an async client
db = client['db_name']  # get db instance
# Fix: the original passed the undefined name `my_db`; the database handle
# created above is `db`.
fs = AsyncIOMotorGridFSBucket(db)  # get async gridfs instance
接下来就可以用fs进行读写操作了
C
这里给出一个插入多个文档的forloop实现:
async def insert_many(fs: AsyncIOMotorGridFSBucket, data: List[Dict]):
    """Upload each record in *data* to GridFS and return the new file ids.

    data example:
    [{"filename": 'x', "metadata": {"version": 0}, "data": b"hello gridfs"}, ...]

    Each record must carry "filename" and "data"; "metadata" is optional
    (``None`` is passed when absent). Ids are returned in input order.
    """
    # Async list comprehension: uploads run sequentially, one await per record,
    # exactly like the original for-loop.
    return [
        await fs.upload_from_stream(
            filename=record['filename'],
            source=record['data'],
            metadata=record.get('metadata'))
        for record in data
    ]
fs.upload_from_stream
参数说明:
filename
是gridfs文档的必须要素
metadata
是用户自定义的元信息,可以忽略
source
可以是二进制字符串,也可以是file-like object,函数支持自动从文件中读取并上传
R
异步读gridfs根据场景不同,可有多种不同实现方式:
直接保存至文件
代码:
f = open('local_path', 'wb')
await fs.download_to_stream(file_id, f) # 通过gridfs文件id下载
await fs.download_to_stream_by_name("test_file", f) # 通过gridfs文件名下载
函数说明:
coroutine download_to_stream(file_id: Any, destination: Any, session: Optional[pymongo.client_session.ClientSession] = None) → None:
"""
Params:
file_id: The _id of the file to be downloaded.
destination: a file-like object implementing write().
session (optional): a ClientSession, created with start_session().
Raise:
raise NoFile if no file with file_id exists.
"""
coroutine download_to_stream_by_name(filename: str, destination: Any, revision: int = - 1, session: Optional[pymongo.client_session.ClientSession] = None) → None:
"""
Params:
filename: The name of the file to read from.
revision (optional): Which revision (documents with the same filename and different uploadDate) of the file to retrieve. Defaults to -1 (the most recent revision). ex: 0 = the original stored file, n = the nth revision, -n = the nth most recent revision
"""
读取文件内容至字符串
代码:
grid_out = await fs.open_download_stream(file_id) # 通过gridfs文件id读取
grid_out = await fs.open_download_stream_by_name(file_id) # 通过gridfs文件名读取
contents = await grid_out.read()
函数说明(参数解释参考download_to_stream及download_to_stream_by_name):
coroutine async open_download_stream(file_id: Any, session: Optional[pymongo.client_session.ClientSession] = None) → gridfs.grid_file.GridOut
coroutine async open_download_stream_by_name(filename: str, revision: int = - 1, session: Optional[pymongo.client_session.ClientSession] = None) → gridfs.grid_file.GridOut
按条件查找文件
代码:
async def query(fs: AsyncIOMotorGridFSBucket, query_str: dict):
    """Return the raw contents of every GridFS file matching *query_str*.

    query_str example: {"filename": 'x', "metadata.version": 0}
    — a user-defined filter in standard MongoDB query format.
    """
    # Fix: the original def line was missing its trailing colon (SyntaxError).
    cursor = fs.find(query_str, no_cursor_timeout=True)
    data = []
    while (await cursor.fetch_next):
        grid_out = cursor.next_object()
        # File metadata is also available on grid_out here:
        #   grid_out._id, grid_out.filename, grid_out.metadata
        data.append(await grid_out.read())
    return data
find可拓展参数:
async def find(*args, **kwargs):
"""
filter: Search query.
batch_size (optional): The number of documents to return per batch.
limit (optional): The maximum number of documents to return.
no_cursor_timeout (optional): The server normally times out idle cursors after an inactivity period (10 minutes) to prevent excess memory use. Set this option to True to prevent that.
skip (optional): The number of documents to skip before returning.
sort (optional): The order by which to sort results. Defaults to None.
session (optional): a ClientSession, created with start_session()
"""
...
U
主要逻辑就是先删除再新建,代码:
async def update(fs: AsyncIOMotorGridFSBucket, query_str: dict, update_data):
    """Replace the single GridFS file matching *query_str* with *update_data*.

    GridFS files are immutable, so the update is delete-then-reinsert: the
    matched file is removed and a new one is uploaded with the same filename
    and metadata. Returns the new file's id.

    Raises Exception when the query matches no file or more than one file.
    """
    cursor = fs.find(query_str, no_cursor_timeout=True)
    matches = []
    while (await cursor.fetch_next):
        doc = cursor.next_object()
        matches.append((doc._id, doc.filename, doc.metadata))
    # Guard clauses: the query must identify exactly one file.
    if not matches:
        raise Exception("No file matched for query condition.")
    if len(matches) > 1:
        raise Exception("More than one file matched for query condition.")
    old_id, filename, metadata = matches[0]
    await fs.delete(old_id)
    return await fs.upload_from_stream(filename=filename, source=update_data, metadata=metadata)
D
AsyncIOMotorGridFSBucket.delete(fileid)
是官方提供的删除接口,但是只支持通过文件id删除,下面提供一个根据搜索条件进行删除实现:
async def delete(fs: AsyncIOMotorGridFSBucket, query_str: dict):
    """Delete every GridFS file matching *query_str*; return the deleted ids.

    The official bucket API only deletes by file id, so this helper first
    resolves the query to matching files and removes them one by one.
    """
    deleted_ids = []
    cursor = fs.find(query_str, no_cursor_timeout=True)
    while (await cursor.fetch_next):
        doc = cursor.next_object()
        deleted_ids.append(doc._id)
        await fs.delete(doc._id)
    return deleted_ids