I've looked at the other answers to this question, and yet it is still not working. I am trying to delete duplicate cases; here is the function:
def deleteDups(datab):
    """Delete duplicate case documents from the ``datab`` collection.

    Documents are grouped on the exact (case-sensitive) pair of their
    ``CASE NUMBER`` and ``JURISDICTION`` values. For every group containing
    more than one document, the first entry of the group's collected ``ids``
    list is kept and every other document in the group is deleted. Each
    duplicate group is printed as it is processed, and the total number of
    deleted documents is printed at the end.

    Args:
        datab: Name of the collection, looked up on the ``db`` object that
            this function reads from the enclosing scope.
    """
    col = db[datab]
    pipeline = [
        {'$group': {
            '_id': {
                # TODO: the grouping key needs to be case insensitive; it is
                # currently an exact match on both fields.
                'CASE NUMBER': '$CASE NUMBER',
                'JURISDICTION': '$JURISDICTION',
            },
            'count': {'$sum': 1},
            'ids': {'$push': '$_id'},
        }},
        # Only groups with more than one member contain duplicates.
        {'$match': {'count': {'$gt': 1}}},
    ]
    # NOTE(review): allowDiskUse is passed so $group may spill to disk. Some
    # hosted deployments (reportedly MongoDB Atlas free/shared tiers) do not
    # honor this option -- confirm against the deployment's limits if the
    # "Exceeded memory limit for $group" error persists.
    results = col.aggregate(pipeline, allowDiskUse=True)

    count = 0
    for result in results:
        print(result)
        # Keep the first document of the group; everything after it is a
        # duplicate and is removed in a single round trip.
        duplicate_ids = result['ids'][1:]
        outcome = col.delete_many({'_id': {'$in': duplicate_ids}})
        # Count what the server actually removed rather than what was asked.
        count += outcome.deleted_count
    print("Total documents deleted:", count)
And yet, every time, I get this traceback:
File "C:\Users\*****\Documents\GitHub\*****\controller.py", line 202, in deleteDups
results = col.aggregate(pipeline, allowDiskUse = True)
File "C:\Python38\lib\site-packages\pymongo\collection.py", line 2375, in aggregate
return self._aggregate(_CollectionAggregationCommand,
File "C:\Python38\lib\site-packages\pymongo\collection.py", line 2297, in _aggregate
return self.__database.client._retryable_read(
File "C:\Python38\lib\site-packages\pymongo\mongo_client.py", line 1464, in _retryable_read
return func(session, server, sock_info, slave_ok)
File "C:\Python38\lib\site-packages\pymongo\aggregation.py", line 136, in get_cursor
result = sock_info.command(
File "C:\Python38\lib\site-packages\pymongo\pool.py", line 603, in command
return command(self.sock, dbname, spec, slave_ok,
File "C:\Python38\lib\site-packages\pymongo\network.py", line 165, in command
helpers._check_command_response(
File "C:\Python38\lib\site-packages\pymongo\helpers.py", line 159, in _check_command_response
raise OperationFailure(msg % errmsg, code, response)
pymongo.errors.OperationFailure: Exceeded memory limit for $group, but didn't allow external sort. Pass allowDiskUse:true to opt in.
I asterisked out bits of the path to protect privacy. But it is driving me absolutely nuts that this line: results = col.aggregate(pipeline, allowDiskUse = True) very explicitly passes allowDiskUse = True, and Mongo is just ignoring it. If I misspelled something, I can't see it. True has to be capitalized to pass a bool in Python.
I feel like I'm going crazy here.
_tmpfolder