How to Find duplicates files (by name and/or size)

2019-01-23 09:42发布

问题:

Is it possible to create a Google apps script which would select automatically duplicate files inside a specific (current) folder (then I just had to delete them with one click) ?

https://developers.google.com/apps-script/reference/drive/

I am asking this because Google drive pc client create random duplicates files when updating them ! They want to solve this bug but I have to manually select them in each folders and then delete them. It is very annoying and time consuming. https://productforums.google.com/forum/#!category-topic/drive/report-a-problem/_ApOopI-Xkw

Edit: Alternatively, as windows cannot create the same filename inside the same folder, find and put in the trash all duplicate filenames inside a specific main folder (and its each of its related subfolders).

回答1:

For better readability I add the next part of this answer in a second answer.

Here is the code to handle the duplicates : it works in 2 steps :

  1. detect the duplicates and mark them in the sheet to check if everything is fine (markDuplicates())
  2. remove the red background on the files you want to preserve (they will become yellow after step 3 is done)
  3. delete the selected (marked) files, actually move them to trash, you'll have to empty the trash manually from the drive Ui. (trashDuplicates()) The deleted files will be colored in deep red to confirm deletion.

LAST EDIT :

Placed the full code here and made some functions private to avoid errors due to bad script calls.

function startProcess(){
  PropertiesService.getScriptProperties().deleteAllProperties();
  try{
    ScriptApp.deleteTrigger(ScriptApp.getProjectTriggers()[0]);
  }catch(e){}
  var sh = SpreadsheetApp.getActiveSheet();
  sh.getDataRange().clear();
  sh.getRange(1,1,1,4).setValues([['fileName (logged @'+Utilities.formatDate(new Date(),Session.getScriptTimeZone(),'MMM-dd-yyyy HH:mm')+')','fileSize','parent folders tree','fileID']]);
  var trig = ScriptApp.newTrigger('getDriveFilesList_').timeBased().everyMinutes(5).create();
  Logger.log(trig.getUniqueId()+'  '+trig.getHandlerFunction());
  getDriveFilesList_();
}

function getDriveFilesList_(){
  var content = [];
  var startTime = new Date().getTime();
  var sh = SpreadsheetApp.getActiveSheet();
  if( ! PropertiesService.getScriptProperties().getProperty('numberOfFiles')){
    PropertiesService.getScriptProperties().setProperty('numberOfFiles',0);
  }

  var numberOfFiles = Number(PropertiesService.getScriptProperties().getProperty('numberOfFiles'));
  Logger.log(numberOfFiles);
  var max = numberOfFiles+10000;
  if( ! PropertiesService.getScriptProperties().getProperty('continuationToken')){
    var files = DriveApp.getFiles();
   // var files = DriveApp.getFolderById('0B3qSFd_____MTFZMDQ').getFiles();// use this line and comment the above if you want to process a single folder
   // use your chozen folder ID of course (available from the browser url , the part after "https://drive.google.com/?authuser=0#folders/")
  }else{
    var files = DriveApp.continueFileIterator(PropertiesService.getScriptProperties().getProperty('continuationToken'))
    }
  while(files.hasNext() && numberOfFiles<(max)){
    var file = files.next()
    if(file.getSize()>0){
      numberOfFiles++;
      var folder = '(shared)';
      if(file.getParents().hasNext()){folder = getTree_(file)}
      content.push([file.getName(),file.getSize(),folder,file.getId()])
    }    
    if(new Date().getTime()-startTime > 250000){break};
  }
  sh.getRange(sh.getLastRow()+1,1,content.length,content[0].length).setValues(content);
  if(!files.hasNext()){ScriptApp.deleteTrigger(ScriptApp.getProjectTriggers()[0]);Logger.log('done !'); sh.getRange(sh.getLastRow()+1,1).setValue('All files processed ('+numberOfFiles+' found)')};
  var continuationToken = files.getContinuationToken()
  PropertiesService.getScriptProperties().setProperty('numberOfFiles',numberOfFiles);
  PropertiesService.getScriptProperties().setProperty('continuationToken',continuationToken);
}

function markDuplicates(){
  handleDuplicates_(false)
}

function trashDuplicates(){
  handleDuplicates_(true)
}

function handleDuplicates_(trash){  
  var sh = SpreadsheetApp.getActiveSheet();
  sh.setFrozenRows(1);
  sh.sort(1);
  var data = sh.getDataRange().getValues()
  var headers = data.shift()
  var lastComment = data.pop();
  var toDelete = [];
  var item = data[0];
  for(var n=1 ; n<data.length; n++){
    if(data[n][0]==item[0] && data[n][1]==item[1]){
      toDelete.push('delete '+n);
    }
    item=data[n];
  }
  var marker =   sh.getRange(2,1,data.length,1).getBackgrounds();
  for(var n in data){
    if(!trash){marker.push(['#FFF'])};
    if(toDelete.indexOf('delete '+n)>-1 && !trash){
      marker[n][0] = '#F99';
    }
    if(toDelete.indexOf('delete '+n)>-1 && trash){
      if(marker[n][0]=='#ff9999'){
        try{
        DriveApp.getFileById(data[n][3]).setTrashed(trash);
        marker[n][0] = '#F33';
        }catch(err){Logger.log(err)}
      }else{
        marker[n][0] = '#FF9';
      }
    }
  }
  sh.getRange(2,1,marker.length,1).setBackgrounds(marker);
}

function getTree_(file){
  var tree = [];
  var folderP = file.getParents()
  while (folderP.hasNext()){
    var folder = folderP.next();
    folderP = folder.getParents();
    tree.push(folder.getName());
  }
  return tree.reverse().join('/');
}


回答2:

This is not going to be simple... DriveApp is not particularly fast and one have usually many files in Drive so a script won't be able to iterate all the files in one single shot.

You will have to process your files in batch and set a time trigger to continue automatically until all the files are processed.

Here is a script that shows how to iterate in small 100 items batches (in real life you can process more than 100, I limit to a small number for demo purpose, just change the value to a more realistic value when you have a realistic idea of how long it takes to execute, see comments in code).

I left aside the duplicate detection because I wasn't sure if all the files had to be examined (Google docs have size=0 so it made me think you wanted to work only on other format files but I wasn't sure) but this part shouldn't be too hard to deal with.

Here is the iteration code, it shows the filenames and size in the logger, along with the global counter :

function searchDupInDrive(){
  if( ! PropertiesService.getScriptProperties().getProperty('numberOfFiles')){PropertiesService.getScriptProperties().setProperty('numberOfFiles',0)};
  var numberOfFiles = Number(PropertiesService.getScriptProperties().getProperty('numberOfFiles'));
  Logger.log(numberOfFiles);
  var max = numberOfFiles+100;// use an appropriate value here so that execution time remains < 5 minutes
  if(numberOfFiles >2000){Logger.log('done !');return}; // this is to limit the demo to a short total execution time
  if( ! PropertiesService.getScriptProperties().getProperty('continuationToken')){
    var files = DriveApp.getFiles();
  }else{
    var files = DriveApp.continueFileIterator(PropertiesService.getScriptProperties().getProperty('continuationToken'))
    }
  while(files.hasNext() && numberOfFiles<(max)){
    var file = files.next()
    Logger.log((numberOfFiles)+' : fileName = '+file.getName()+'  size = '+file.getSize())
    numberOfFiles++;

    // set conditions, store file Names and sizes to be able to compare

  }
  var continuationToken = files.getContinuationToken()
  PropertiesService.getScriptProperties().setProperty('numberOfFiles',numberOfFiles);
  PropertiesService.getScriptProperties().setProperty('continuationToken',continuationToken);
}

Note : to achieve duplication detection you will probably have to store the criteria, file names and sizes somewhere between 2 script runs, I guess script properties would be useable for that too.

Please update here when you have a final result.

Note 2 : I read the thread about this issue : quite horrifying ! I'm glad I'm on Mac OS, no problem like that (for now :-)


EDIT

Here is a second version. In the comments you told me that you added the script to a spreadsheet so it gave me the idea to store the file list in a spreadsheet so that, in a second step, we can sort the files and see /select the duplicates more easily.

By the way, I noticed that the driveApp was faster then I thought, not even sure we have to process in batch... nevertheless I limited the execution time to less than 5 minutes and implemented the script trigger creation / suppression just in case you have a lot of files...

Here is the new code, I still left the duplicate detection aside, you'll find very easily a lot of code examples to select the duplicates in a spreadsheet... this first step of getting all the files was the most non obvious.

Now there are 2 functions, one to start the process (clear the sheet, create the trigger, set a title to columns) and the actual drive reading one (It takes only the files that are NOT google docs, ie files with size>0):

function startProcess(){
  PropertiesService.getScriptProperties().deleteAllProperties();
  var sh = SpreadsheetApp.getActiveSheet();
  sh.getDataRange().clear();
  sh.getRange(1,1,1,4).setValues([['fileName (logged @'+Utilities.formatDate(new Date(),Session.getScriptTimeZone(),'MMM-dd-yyyy HH:mm')+')','fileSize','parent folder','fileID']]);
  var trig = ScriptApp.newTrigger('getDriveFilesList').timeBased().everyMinutes(5).create();
  Logger.log(trig.getUniqueId()+'  '+trig.getHandlerFunction());
  getDriveFilesList();
}

function getDriveFilesList(){
  var content = [];
  var startTime = new Date().getTime();
  var sh = SpreadsheetApp.getActiveSheet();
  if( ! PropertiesService.getScriptProperties().getProperty('numberOfFiles')){
    PropertiesService.getScriptProperties().setProperty('numberOfFiles',0);
  }

  var numberOfFiles = Number(PropertiesService.getScriptProperties().getProperty('numberOfFiles'));
  Logger.log(numberOfFiles);
  var max = numberOfFiles+10000;
  if( ! PropertiesService.getScriptProperties().getProperty('continuationToken')){
    var files = DriveApp.getFiles();
  }else{
    var files = DriveApp.continueFileIterator(PropertiesService.getScriptProperties().getProperty('continuationToken'))
    }
  while(files.hasNext() && numberOfFiles<(max)){
    var file = files.next()
    if(file.getSize()>0){
      numberOfFiles++;
      var folder = '(shared)';
      if(file.getParents().hasNext()){folder = file.getParents().next().getName()}
      content.push([file.getName(),file.getSize(),folder,file.getId()])
    }    
    if(new Date().getTime()-startTime > 250000){break};
  }
  sh.getRange(sh.getLastRow()+1,1,content.length,content[0].length).setValues(content);
  if(!files.hasNext()){ScriptApp.deleteTrigger(ScriptApp.getProjectTriggers()[0]);Logger.log('done !'); sh.getRange(sh.getLastRow()+1,1).setValue('All files processed ('+numberOfFiles+' found)')};
  var continuationToken = files.getContinuationToken()
  PropertiesService.getScriptProperties().setProperty('numberOfFiles',numberOfFiles);
  PropertiesService.getScriptProperties().setProperty('continuationToken',continuationToken);
}