You are on page 1 of 1

import sys

import os
import filecmp
from hashlib import sha512
from scandir import scandir
class Pair:
    """A file path bundled with a digest of that file's contents.

    Two pairs are equal when their digests match *and* a byte-by-byte
    comparison of the files at both paths succeeds. Equal pairs share the
    same hash value, so instances can be used in sets and as dict keys.
    """

    def __init__(self, hash, path):
        """Store the digest and the path.

        Args:
            hash: Digest of the file contents (must be hashable if the
                pair itself is going to be hashed).
            path: Path of the file on disk.
        """
        # NOTE: ``hash`` shadows the builtin inside this method only; the
        # parameter name is kept so keyword callers keep working.
        self.hash = hash
        self.path = path

    def __eq__(self, other):
        """Return True if ``other`` is a Pair describing identical contents."""
        if not isinstance(other, Pair):
            # Let Python try the reflected comparison / fall back to
            # identity instead of failing with AttributeError.
            return NotImplemented
        if self.hash != other.hash:
            return False
        # shallow=False forces a real content comparison; the default would
        # accept the files as equal as soon as their os.stat() signatures
        # (type, size, mtime) match, without reading them.
        return filecmp.cmp(self.path, other.path, shallow=False)

    def __hash__(self):
        """Hash on the digest only, which keeps it consistent with __eq__."""
        return hash(self.hash)

    def __str__(self):
        """Return the file path."""
        return self.path
def get_hash_of_the_file(file):
    """Return the raw SHA-512 digest (bytes) of the contents of ``file``.

    The file is opened in binary mode and fed to the hasher in 4096-byte
    pieces, so it is never held in memory as a whole.
    """
    block_size = 4096
    digest = sha512()
    with open(file, "rb") as stream:
        # Sentinel form of iter(): keep calling read() until it returns the
        # empty bytes object, i.e. end of file.
        for block in iter(lambda: stream.read(block_size), b""):
            digest.update(block)
    return digest.digest()
def recursive_searching_for_files(curdir, founded_files):
    """Append a ``Pair(digest, path)`` for every visible file below ``curdir``.

    Real sub-directories (hidden ones included) are walked recursively.
    Files whose name starts with ``.`` or ``~`` are skipped. Symlinks that
    resolve to a regular file are hashed like ordinary files; symlinks to
    directories are NOT descended into, so a link pointing back up the tree
    cannot cause endless recursion. Dangling symlinks and special files
    (FIFOs, sockets, devices) are ignored instead of being opened.

    Args:
        curdir: Directory to scan.
        founded_files: List that is extended in place with the results.

    Returns:
        None. Results are delivered through ``founded_files``.
    """
    for entry in scandir(curdir):
        # os.path.join avoids doubled separators when curdir already ends
        # with one.
        path = os.path.join(curdir, entry.name)
        if entry.is_dir(follow_symlinks=False):
            recursive_searching_for_files(path, founded_files)
        elif entry.is_file() and not entry.name.startswith(('.', '~')):
            # is_file() follows symlinks, so links to regular files count.
            founded_files.append(Pair(get_hash_of_the_file(path), path))
def main():
    """Print every group of duplicate files found under ``sys.argv[1]``.

    Expects exactly one command-line argument, the directory to scan;
    otherwise prints a usage line and exits with status 1. Each output line
    lists the paths of one group of identical files, separated by ``:``.
    Files without a duplicate are not printed.
    """
    if len(sys.argv) != 2:
        print('usage: ./duplicates.py directory')
        sys.exit(1)

    founded_files = []
    recursive_searching_for_files(sys.argv[1], founded_files)

    # Classes are bucketed by digest so that a file is only compared with
    # classes that can possibly match, instead of with every class seen so
    # far. ``equivalent_classes`` keeps first-appearance order for output.
    classes_by_digest = {}
    equivalent_classes = []
    for cur in founded_files:
        candidates = classes_by_digest.setdefault(cur.hash, [])
        for group in candidates:
            # Pair equality confirms the match against the class's first file.
            if group[0] == cur:
                group.append(cur)
                break
        else:
            # No existing class matched: start a new one.
            new_group = [cur]
            candidates.append(new_group)
            equivalent_classes.append(new_group)

    for group in equivalent_classes:
        if len(group) > 1:
            print(*group, sep=':')


# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

You might also like