Here you can find a script for deduplicating files based on their hash. It's written in PowerShell and compares files based on their hash values.
Examples
Here are examples of how to use the script in Windows 10 and macOS.
Single location:
Windows: .\deduplicate.ps1 -paths "d:\" -showGui true
macOS: .\deduplicate.ps1 -paths "/Users/temp/Downloads" -showGui false
Multiple locations:
Windows: .\deduplicate.ps1 -paths "d:\","e:\" -showGui true
macOS: .\deduplicate.ps1 -paths "/Users/temp/Downloads","/Users/temp" -showGui false
Outputs:
Known files ignored: 10 | Sum of known ignored files: 3 GiB | Files hashed: 88 | Sum of hashed files: 10 GiB | Duplicates found: 3 | Sum of duplicate files: 535 MiB
Available variables after running: $global:ignoreds, $global:fileHashes, $global:duplicates
Source Code
<#
.SYNOPSIS
Scans one or more folders for duplicates
.DESCRIPTION
Scans one or more folders and calculates a hash for each file found.
Each hash is compared with the hashes calculated earlier.
If the hash is the same, the file is a duplicate.
.PARAMETER paths
One or more paths can be given, e.g. "\\server\share","d:\"
.PARAMETER showGui
You can set the visibility of the interface, i.e. true or false.
.PARAMETER hashType
(Optional) You can set the hashing type. Default is "MD5"
.PARAMETER unitTest
(Optional) You can choose between running the script or the unit tests, i.e. true or false. Default is false
.INPUTS
One or more paths
.OUTPUTS
Files ignored 10
Sum of ignored files 3 Gib
Files hashed 88
Sum of hashed files 10 Gib
Duplicates found 3
Sum of duplicate files 535 Mib
.EXAMPLE
Windows:
.\deduplicate.ps1 -paths "d:\" -showGui true
Apple:
.\deduplicate.ps1 -paths "/Users" -showGui false
.EXAMPLE
Windows:
.\deduplicate.ps1 -paths "d:\","e:\" -showGui true
Apple:
.\deduplicate.ps1 -paths "/Users/temp/Downloads","/Users/temp" -showGui false
.NOTES
Author: Antoine Engelen
Version: 0.0.3
Status: CONCEPT
Changelog:
0.0.1 Initial Release
0.0.2 Paths and files with wildcard brackets in their name will be excluded.
0.0.3 Added unit test. And started to move methods to classes.
.LINK
https://gitlab.engelenathome.info/Powershell/Deduplicate
#>
#requires -Version 5.1
[CmdletBinding()]
param (
# One or more folders to scan; handed to Get-DuplicatesFromPaths at the end of the script.
# NOTE(review): the parameter is Mandatory, so PowerShell presumably prompts for it itself
# and the Read-Host default below is never evaluated - confirm before relying on that prompt text.
[Parameter(Mandatory = $true)]
[Array]$paths = (Read-Host 'Give one or more accesible path like [c:\] or [("c:\","d:\")]'),
# Text "true" or "false"; Get-Information converts it with [System.Convert]::ToBoolean
# to choose between the summary window and console output.
# NOTE(review): same Mandatory/default interplay as for $paths - confirm.
[Parameter(Mandatory = $true)]
[String]$showGui = (Read-Host 'In Windows show graphical interface [true] or [false]'),
# Algorithm name passed to Get-FileHash -Algorithm in Get-FileContent.
[Parameter(Mandatory = $false)]
[String]$hashType = "MD5",
# Text "true" or "false"; converted with [System.Convert]::ToBoolean at the bottom of the
# script. When true the Pester tests run instead of the scan.
[Parameter(Mandatory = $false)]
[String]$unitTest = $false
)
# Install-Module -Name Pester -Force
# Minimal stopwatch: stores two timestamps and renders the time between them
# as an 'HH:mm:ss.fff' string in the Difference property.
class stopWatch {
    # Timestamp captured by StartTime().
    [datetime]$Time1
    # Timestamp captured by EndTime().
    [datetime]$Time2
    # Elapsed time between Time1 and Time2; filled in by DifferenceBetweenTimes().
    [String]$Difference

    # Records the start of a measurement.
    [void]StartTime () {
        # Keep the full timestamp. Formatting it to 'HH:mm:ss.fff' and parsing the
        # text back dropped the date, so a measurement that crossed midnight came
        # out negative, and the text round trip depended on the culture's time format.
        $this.Time1 = Get-Date
    }

    # Records the end of a measurement.
    [void]EndTime () {
        $this.Time2 = Get-Date
    }

    # Computes Time2 - Time1 and stores it in Difference as 'HH:mm:ss.fff'.
    [void]DifferenceBetweenTimes() {
        $TimeDiff = New-TimeSpan -Start $this.Time1 -End $this.Time2
        # Fold whole days into the hour field so spans of 24 hours or more do not wrap.
        $Hrs = ($TimeDiff.Days * 24) + $TimeDiff.Hours
        $Mins = $TimeDiff.Minutes
        $Secs = $TimeDiff.Seconds
        $Mill = $TimeDiff.Milliseconds
        $this.Difference = '{0:00}:{1:00}:{2:00}.{3:000}' -f $Hrs, $Mins, $Secs, $Mill
    }
}
# Console helpers shared by the scan functions.
class systemTaskes {
    # Updates (or closes) the "Hashing" progress bar with the given id.
    #   $id      - progress bar id, forwarded to Write-Progress -Id
    #   $current - zero-based position of the item being processed
    #   $max     - total number of items
    #   $text    - text shown as the current operation
    [void]ProgressBar($id, $current, $max, $text) {
        $reachedEnd = ($current + 1) -ge $max
        if ($reachedEnd) {
            # Last item (or nothing to do): remove the bar.
            Write-Progress -Id $id -Activity Hashing -Completed
            return
        }

        $percentDone = ($current / $max) * 100
        Write-Progress -Id $id -Activity Hashing -Status 'Progress->' -PercentComplete $percentDone -CurrentOperation $text
    }
}
# Empty class: it declares no properties or methods and nothing in this script uses it.
# NOTE(review): presumably a placeholder for moving the scan functions into classes - confirm.
class deduplicateFactory {
}
function Get-DetermineHardwareArchitecture {
    <#
    .SYNOPSIS
    Reports whether the running PowerShell process is 32-bit or 64-bit.
    .DESCRIPTION
    Only answers on Windows; on any other platform nothing is returned.
    The bitness is derived from the pointer size of the current process:
    4 bytes means 32-bit, 8 bytes means 64-bit.
    .OUTPUTS
    System.String. "32-bit", "64-bit" or "Unknown" on Windows; nothing elsewhere.
    #>

    # Ask .NET for the platform. The earlier check compared the list of environment
    # variables against $IsWindows, which filters that list instead of testing the
    # platform, so the Windows branch was not reliably entered.
    $onWindows = [System.Environment]::OSVersion.Platform -eq [System.PlatformID]::Win32NT
    if (-not $onWindows) {
        return
    }

    if ([IntPtr]::Size -eq 4) {
        Write-Verbose "$(Get-Date): Hardware architecture 32-bit"
        return "32-bit"
    }
    elseif ([IntPtr]::Size -eq 8) {
        Write-Verbose "$(Get-Date): Hardware architecture 64-bit"
        return "64-bit"
    }
    else {
        Write-Verbose "$(Get-Date): Hardware architecture Unknown"
        return "Unknown"
    }
}
function Get-CheckDuplicate {
    <#
    .SYNOPSIS
    Registers one file as either a duplicate or a newly seen file.
    .DESCRIPTION
    Compares the file's length and hash with the entries in $global:fileHashes.
    On the first match an entry (path of the earlier file, path of this file, length)
    is appended to $global:duplicates and the function returns.
    Otherwise an entry (path, hash, length) is appended to $global:fileHashes.
    A file without a hash is stored with the hash "[Unknown]" and is not compared.
    .PARAMETER fileFullName
    Full path of the file being registered.
    .PARAMETER fileLength
    Size of the file in bytes.
    .PARAMETER hash
    Hash of the file; may be omitted or empty when hashing failed.
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [String]$fileFullName,
        [Parameter(Mandatory = $true)]
        [Int64]$fileLength,
        [Parameter(Mandatory = $false)]
        [String]$hash
    )

    # A [String] parameter is never $null (a missing value arrives as ''), so test for empty.
    if ([String]::IsNullOrEmpty($hash)) {
        $hash = "[Unknown]"
    }
    else {
        # A foreach statement is used on purpose: 'return' inside a ForEach-Object
        # script block only leaves that script block, which previously let every file
        # fall through, get its hash overwritten with "[Unknown]" and be stored as new.
        foreach ($known in $global:fileHashes) {
            if ($known[2] -eq $fileLength -and $known[1] -eq $hash) {
                $global:duplicates += @(, @($known[0], $fileFullName, $fileLength))
                return
            }
        }
    }

    $global:fileHashes += @(, @($fileFullName, $hash, $fileLength))
}
function Get-FileContent {
    <#
    .SYNOPSIS
    Hashes every file in a folder listing and registers it through Get-CheckDuplicate.
    .DESCRIPTION
    For each item:
      - a path that fails Test-Path -IsValid is probed with Get-Item; an
        UnauthorizedAccessException adds it to $global:accessDenied;
      - a file with a length of 0 is added to $global:ignoreds;
      - any other file is hashed with Get-FileHash (algorithm taken from $hashType),
        passed to Get-CheckDuplicate and reported on progress bar 2.
    Timing is recorded with the shared $stopWatchFileHash / $stopWatchCheckDuplicate objects.
    .PARAMETER folderContent
    The file objects to process: a single item, an array, or $null for an empty folder.
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [AllowNull()]
        [AllowEmptyCollection()]
        [Object]$folderContent
    )

    if ($null -eq $folderContent) {
        return
    }

    # Normalise to an array: a folder with exactly one file yields a single object,
    # which has no IndexOf method. A running index also avoids an IndexOf scan per file.
    $files = @($folderContent)
    $total = $files.Count

    for ($index = 0; $index -lt $total; $index++) {
        # Named loop variable on purpose: inside a catch block $_ is the error record,
        # not the file, so the denied path used to be lost.
        $file = $files[$index]
        $stopWatchFileHash.StartTime()

        if (-not (Test-Path -IsValid -Path "$($file.FullName)" -ErrorAction SilentlyContinue)) {
            try {
                # Probe only; the item itself is not wanted as function output.
                Get-Item -Path $file.FullName -ErrorAction Stop -ErrorVariable GIError | Out-Null
            }
            Catch [System.UnauthorizedAccessException] {
                $global:accessDenied += @(, @($file.FullName))
            }
            Catch {
                Write-Warning "An unforseen error has happend."
                Write-Warning "During the hash routine."
                Write-Warning "Last error: $($GIError.Exception)"
            }
            continue
        }

        if (-not ($file.length -gt 0)) {
            $global:ignoreds += @(, @($file.FullName, $file.length))
            continue
        }

        # Reset per file so a failed hash can never reuse the previous file's hash.
        $thisFile = $null
        Try {
            $thisFile = Get-FileHash -Algorithm $hashType -Path "$($file.FullName)"
            $stopWatchFileHash.EndTime()
        }
        Catch [System.UnauthorizedAccessException] {
            Write-Verbose "$(Get-Date) Warning: Unauthorized Access Exception"
            $global:accessDenied += @(, @("$($file.FullName)"))
            # Counted as access denied; do not register it as a hashed file as well.
            continue
        }
        Catch {
            Write-Verbose "$(Get-Date) Warning: An unforseen error has happend."
            Write-Verbose "$(Get-Date) Warning: During the hash routine."
            # NOTE(review): $CreateMsg is not assigned anywhere in the visible code.
            Write-Verbose "$(Get-Date) Warning: Create message: $CreateMsg"
            Write-Verbose "$(Get-Date) Warning: Last error: $($Error[0])"
            # Any other failure aborts the run, as before.
            Exit
        }

        $stopWatchFileHash.DifferenceBetweenTimes()
        $stopWatchCheckDuplicate.StartTime()
        Get-CheckDuplicate -fileFullName $file.FullName -fileLength $file.length -hash $thisFile.hash
        $stopWatchCheckDuplicate.EndTime()
        $stopWatchCheckDuplicate.DifferenceBetweenTimes()
        $global:sytemTaskes.ProgressBar(2, $index, $total, "($($index)/$($total)) Hashing: $($stopWatchFileHash.Difference) Deduplicatng: $($stopWatchCheckDuplicate.Difference) HASH: $($thisFile.Hash) Size: $(Get-CoverToByteSize $file.length) Name: $($file.Name)")
    }
}
function Get-FolderContent {
    <#
    .SYNOPSIS
    Lists all files below a folder, including sub-folders.
    .PARAMETER path
    Folder to list.
    .OUTPUTS
    The file objects returned by Get-ChildItem -File -Recurse. Items that cannot be
    read are skipped silently (-ErrorAction SilentlyContinue).
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [String]$path
    )
    # Report the folder actually being listed; $_ only held a value here when the
    # caller happened to be inside a pipeline.
    Write-Verbose "$(Get-Date): Getting content of folder $($path)"
    return (Get-ChildItem -Path $path -File -Recurse -ErrorAction SilentlyContinue)
}
function Get-CoverToByteSize {
    <#
    .SYNOPSIS
    Renders a byte count as a rounded number followed by a unit name.
    .DESCRIPTION
    Picks the largest power of 1024 (up to 1024^8) that does not exceed the size,
    divides by it, rounds, and appends the matching name from $global:BytePrefixNames.
    A negative size yields $null.
    .PARAMETER size
    Number of bytes.
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [Int64]$size
    )

    $label = $null

    # Search from the largest unit downwards and stop at the first one that fits.
    for ($exponent = 8; $exponent -ge 1; $exponent--) {
        if ($size -ge [Math]::Pow(1024, $exponent)) {
            $label = "$([math]::Round($size / [Math]::Pow(1024, $exponent))) $($global:BytePrefixNames[$exponent])"
            break
        }
    }

    # Nothing fitted: plain bytes, provided the size is not negative.
    if ($null -eq $label -and $size -ge 0) {
        $label = "$([math]::Round($size)) $($global:BytePrefixNames[0])"
    }

    return $label
}
function Get-DuplicatesFromPaths {
    <#
    .SYNOPSIS
    Scans every given folder for duplicate files.
    .DESCRIPTION
    For each path that exists, progress bar 1 is updated, the folder is listed with
    Get-FolderContent and the listing is handed to Get-FileContent.
    Paths that do not exist, and folders without files, are only mentioned on the
    verbose stream.
    .PARAMETER paths
    One or more folders to scan.
    #>
    [CmdletBinding()]
    Param (
        [Parameter(Mandatory = $true)]
        [Array]$paths
    )

    $total = $paths.Count
    # Running position instead of IndexOf, so a path that is listed twice still
    # reports its own position.
    $position = 0

    $paths | ForEach-Object {
        if (Test-Path $_) {
            $global:sytemTaskes.ProgressBar(1, $position, $total, "($($position + 1)/$($total)) path $($_)")
            $content = Get-FolderContent -path $_
            if ($null -eq $content) {
                # An empty folder would otherwise hand $null to Get-FileContent's
                # mandatory parameter and raise a binding error.
                Write-Verbose "$(Get-Date): Folder $($_) contains no files"
            }
            else {
                Get-FileContent -folderContent $content
            }
        }
        else {
            Write-Verbose "$(Get-Date): Folder $($_) doesn't exist"
        }
        $position++
    }
}
function Get-DuplicatesExportToJson {
    <#
    .SYNOPSIS
    Writes $global:duplicates as JSON to export.json in the current directory.
    #>
    $exportFile = '.\export.json'

    $global:duplicates |
        ConvertTo-Json |
        Set-Content -Path $exportFile
}
function Get-RemoveExportJson {
    <#
    .SYNOPSIS
    Deletes export.json from the current directory when it exists.
    #>
    $exportFile = '.\export.json'

    # Nothing to clean up.
    if (-not (Test-Path -Path $exportFile)) {
        return
    }

    Remove-Item $exportFile
}
function Get-Information {
    <#
    .SYNOPSIS
    Reports the totals of the scan, either in a window or on the console.
    .DESCRIPTION
    Sums the sizes stored in $global:ignoreds, $global:fileHashes and
    $global:duplicates. When the script parameter $showGui converts to true the
    summary is shown through New-DrawWindow; otherwise it is written with Write-Host.
    #>

    # Size lives at index 1 of an ignored entry and at index 2 of the other two lists.
    foreach ($entry in $global:ignoreds) { $sizeIgnored += $entry[1] }
    foreach ($entry in $global:fileHashes) { $sizeHased += $entry[2] }
    foreach ($entry in $global:duplicates) { $sizeDuplicate += $entry[2] }

    $useWindow = [System.Convert]::ToBoolean($($showGui))

    if ($useWindow) {
        $summaryLines = @(
            "Acces denied files: $($global:accessDenied.Count)`n"
            "Known files ignored: $($global:ignoreds.Count)`n"
            "Sum of known ignored files: $(Get-CoverToByteSize $sizeIgnored)`n"
            "Files hashed: $($global:fileHashes.Count)`n"
            "Sum of hashed files: $(Get-CoverToByteSize $sizeHased)`n"
            "Duplicates found: $($global:duplicates.Count)`n"
            "Sum of duplicate files: $(Get-CoverToByteSize $sizeDuplicate)`n"
        )
        New-DrawWindow -text ($summaryLines -join '')
        return
    }

    Write-Host "$(Get-Date): Acces denied files $($global:accessDenied.Count)"
    Write-Host "$(Get-Date): Known files ignored $($global:ignoreds.Count)"
    Write-Host "$(Get-Date): Sum of known ignored files $(Get-CoverToByteSize $sizeIgnored)"
    Write-Host "$(Get-Date): Files hashed $($global:fileHashes.Count)"
    Write-Host "$(Get-Date): Sum of hashed files $(Get-CoverToByteSize $sizeHased)"
    Write-Host "$(Get-Date): Duplicates found $($global:duplicates.Count)"
    Write-Host "$(Get-Date): Sum of duplicate files $(Get-CoverToByteSize $sizeDuplicate)"
}
function DoExec {
    <#
    .SYNOPSIS
    Closes the given form; used as the handler for the window's buttons and keys.
    .PARAMETER objForm
    The object whose Close() method is called.
    #>
    [CmdletBinding()]
    Param (
        [Parameter(Mandatory = $true)] [Object]$objForm
    )

    $formToClose = $objForm
    $formToClose.Close()
}
function Get-ApplicationIcon {
    <#
    .SYNOPSIS
    Returns the icon of powershell.exe for use as the window icon.
    .DESCRIPTION
    Uses Get-DetermineHardwareArchitecture to pick the folder below %WinDir%
    ("32-bit" -> System32, "64-bit" -> SysWOW64) and extracts the icon associated
    with WindowsPowerShell\v1.0\powershell.exe in that folder.
    .OUTPUTS
    System.Drawing.Icon, or $null when the architecture is not recognised, %WinDir%
    is not set, the executable is missing or the icon cannot be extracted.
    #>

    # Plain ASCII quotes throughout: typographic quotes depend on the file encoding
    # being detected correctly before the script can even be parsed.
    if ([String]::IsNullOrEmpty($Env:WinDir)) {
        return $null
    }

    $relativeExe = 'WindowsPowerShell\v1.0\powershell.exe'
    $exePath = $null

    Switch (Get-DetermineHardwareArchitecture) {
        "32-bit" {
            $exePath = Join-Path -Path (Join-Path -Path $Env:WinDir -ChildPath 'System32') -ChildPath $relativeExe
        }
        "64-bit" {
            # NOTE(review): the SysWOW64 folder is kept from the original mapping - confirm it is intended.
            $exePath = Join-Path -Path (Join-Path -Path $Env:WinDir -ChildPath 'SysWOW64') -ChildPath $relativeExe
        }
        Default {
            return $null
        }
    }

    # Join-Path replaces the former string concatenation, which mixed '/' and '\'
    # and produced a doubled separator.
    if (-not $exePath -or -not (Test-Path -LiteralPath $exePath -PathType Leaf)) {
        return $null
    }

    try {
        return [System.Drawing.Icon]::ExtractAssociatedIcon($exePath)
    }
    catch {
        # A missing icon must not stop the summary window from opening.
        Write-Verbose "$(Get-Date): Could not load application icon: $($_.Exception.Message)"
        return $null
    }
}
function New-DrawWindow {
    <#
    .SYNOPSIS
    Shows a small, centred, top-most window with the given text and OK / Cancel buttons.
    .DESCRIPTION
    Builds a Windows Forms dialog and blocks until it is closed. OK, Cancel, Enter
    and Escape all close the window through DoExec.
    .PARAMETER text
    Text shown in the window.
    #>
    [CmdletBinding()]
    Param (
        [Parameter(Mandatory = $true)]
        [String]$text
    )

    # Add-Type replaces the deprecated Assembly.LoadWithPartialName. Plain ASCII quotes
    # are used throughout, because typographic quotes depend on the file encoding
    # being detected correctly.
    Add-Type -AssemblyName System.Drawing
    Add-Type -AssemblyName System.Windows.Forms

    #Setting constants for GUI.
    $F_Width = 400
    $F_Height = 180
    $B_PosX = 200
    $B_PosY = 100
    $B_Width = 75
    $B_Height = 23

    $objForm = New-Object System.Windows.Forms.Form
    try {
        $objForm.Text = "Powershell - Deduplicate"
        $objForm.Size = New-Object System.Drawing.Size($F_Width, $F_Height)
        $objForm.StartPosition = "CenterScreen"
        $objForm.Icon = Get-ApplicationIcon
        # KeyPreview lets the form see Enter/Escape before the focused control does.
        $objForm.KeyPreview = $true
        $objForm.Add_KeyDown( { if ($_.KeyCode -eq "Enter") { DoExec($objForm) } })
        $objForm.Add_KeyDown( { if ($_.KeyCode -eq "Escape") { DoExec($objForm) } })

        $OKButton = New-Object System.Windows.Forms.Button
        $OKButton.Location = New-Object System.Drawing.Size($B_PosX, $B_PosY)
        $OKButton.Size = New-Object System.Drawing.Size($B_Width, $B_Height)
        $OKButton.Text = "OK"
        $OKButton.Add_Click( { DoExec($objForm) })
        $objForm.Controls.Add($OKButton)

        # Placed directly to the right of the OK button.
        $CancelButton = New-Object System.Windows.Forms.Button
        $CancelButton.Location = New-Object System.Drawing.Size(($B_PosX + $B_Width), $B_PosY)
        $CancelButton.Size = New-Object System.Drawing.Size($B_Width, $B_Height)
        $CancelButton.Text = "Cancel"
        $CancelButton.Add_Click( { DoExec($objForm) })
        $objForm.Controls.Add($CancelButton)

        $objLabel = New-Object System.Windows.Forms.Label
        $objLabel.Location = New-Object System.Drawing.Size(20, 20)
        $objLabel.Size = New-Object System.Drawing.Size($F_Width, 250)
        $objLabel.Text = $text
        $objForm.Controls.Add($objLabel)

        $objForm.Topmost = $true
        $objForm.Add_Shown( { $objForm.Activate() })
        [void] $objForm.ShowDialog()
    }
    finally {
        # Release the form's resources once the dialog has been closed.
        $objForm.Dispose()
    }
    # The stray, never-assigned $DrawTextBox that used to be emitted here was removed.
}
# Shared state for the whole run. Everything lives in the global scope so it can
# still be inspected after the script has finished.

# Unit names indexed by power of 1024; read by Get-CoverToByteSize.
[Array]$global:BytePrefixNames = @('bytes', 'Kib', 'Mib', 'Gib', 'Tib', 'Pib', 'Eib', 'Zib', 'Yib')

# Result lists filled during the scan.
[Array]$global:duplicates = @()
[Array]$global:fileHashes = @()
[Array]$global:ignoreds = @()
[Array]$global:accessDenied = @()

# Helper objects: one stopwatch per measured step, plus the progress bar helper.
[stopWatch]$global:stopWatchFileHash = [stopWatch]::new()
[stopWatch]$global:stopWatchCheckDuplicate = [stopWatch]::new()
[systemTaskes]$global:sytemTaskes = [systemTaskes]::new()
# Entry point: depending on $unitTest either run the Pester tests or scan the given paths.
Clear-Host
Write-Host "$(Get-Date): Deduplicate Script begins"
if ([System.Convert]::ToBoolean($($unitTest))) {
# Pester tests
Describe 'Unit-test' {
# Ten timing checks, each sleeping a random 0-8 seconds between StartTime and EndTime.
for ($loopCounter=0; $loopCounter -lt 10; $loopCounter++) {
$randomValue = Get-Random -Maximum 9
It "Stopwatch should register close to $randomValue seconds" {
$global:stopWatchFileHash.StartTime()
sleep $randomValue
$global:stopWatchFileHash.EndTime()
$global:stopWatchFileHash.DifferenceBetweenTimes()
# Accepts the slept number of seconds plus less than 100 milliseconds.
$global:stopWatchFileHash.Difference | Should -BeLike ("00:00:0$($randomValue).0*")
}
}
It "There should be atleast 9 byte prefix names" {
$BytePrefixNames.Count | Should -BeGreaterThan 8
}
# Disabled test, kept for reference.
# It 'All files should be proccesed' {
# $paths | ForEach-Object {
# $totalFileCount += $paths.IndexOf($_).Count
# }
# $fileHashes.Count + $ignoreds.Count + $accessDenied.count + 1 | Should -HaveCount $totalFileCount
# }
# Create a dummy export.json so that Get-RemoveExportJson finds a file to remove.
"Hello world!!!" | Set-Content -path .\export.json
It 'Removes Export.json' {
Mock -CommandName Remove-Item -MockWith { }
Get-RemoveExportJson
Assert-MockCalled -CommandName Remove-Item -Times 1 -Exactly
}
# Remove-Item was mocked inside the test above, so delete the dummy file here.
if (Test-Path -Path .\export.json ) {
Remove-Item .\export.json -Force
}
# Dummy content for the export test below.
$global:duplicates = "hello world!!"
It 'Export.json should exists in the script folder' {
Mock -CommandName Set-Content -MockWith { }
Get-DuplicatesExportToJson
Assert-MockCalled -CommandName Set-Content -Times 1 -Exactly
}
}
}
else {
# Normal run: remove the previous export, scan, report, then write the new export.
Write-Host "$(Get-Date): Getting content from given path(s)."
Write-Host "$(Get-Date): This may take awhile dependig on size of the filestructure."
Get-RemoveExportJson
Get-DuplicatesFromPaths -paths $paths
Get-Information
Get-DuplicatesExportToJson
}
Write-Host "$(Get-Date): Deduplicate Script completed!"
Tips
- This script has only been tested on Windows 10.
- Requires PowerShell 5.1.
- UNC paths also work as input values.
- You may need to run:
Set-ExecutionPolicy -ExecutionPolicy RemoteSigned
License
MIT
Copyright (c) 2019 Antoine Engelen
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction.