WINDOWS POWERSHELL - DEDUPLICATE

Ingediend door antoine op

Hier kun je een script vinden om bestanden te dedupliceren op basis van hun hash. Het is geschreven in PowerShell en vergelijkt bestanden op basis van hun hashwaarden.

Voorbeelden

Hier zijn voorbeelden van hoe je het script kunt gebruiken in Windows 10 en macOS.

Enkele locatie:

Windows: .\deduplicate.ps1 -paths "d:\" -showGui true
macOS: .\deduplicate.ps1 -paths "/Users/temp/Downloads" -showGui false

Meerdere locaties:

Windows: .\deduplicate.ps1 -paths "d:\","e:\" -showGui true
macOS: .\deduplicate.ps1 -paths "/Users/temp/Downloads","/Users/temp" -showGui false

Uitvoer:

Known files ignored: 10  |  Sum of known ignored files: 3 GiB  |  Files hashed: 88  |  Sum of hashed files: 10 GiB  |  Duplicates found: 3  |  Sum of duplicate files: 535 MiB 

Beschikbare variabelen na uitvoering:
$global:ignoreds, $global:fileHashes, $global:duplicates

Broncode
<#
.SYNOPSIS
	Scans one or more folders for duplicates
.DESCRIPTION
	Scans one or more folders and calculates a hash for every file found.
	Each hash is compared with the hashes calculated earlier.
	If the hash is the same, the file is a duplicate.
.PARAMETER paths
	One or more paths can be given, e.g. "\\server\share","d:\"
.PARAMETER showGui
	You can set the visibility of the graphical interface, i.e. true or false.
.PARAMETER hashType(optional)
	You can set the hashing type. Default is "MD5"
.PARAMETER unitTest(optional)
	You can choose between running the script or its unit tests, i.e. true or false. Default is false
.INPUTS
	One or more paths
.OUTPUTS
	Files ignored  10
	Sum of ignored files 3 Gib
	Files hashed  88
	Sum of hashed files 10 Gib
	Duplicates found  3
	Sum of duplicate files 535 Mib
.EXAMPLE
	Windows:
		.\deduplicate.ps1 -paths "d:\" -showGui true
	Apple:
		.\deduplicate.ps1 -paths "/Users" -showGui false
.EXAMPLE
	Windows:
		.\deduplicate.ps1 -paths "d:\","e:\" -showGui true
	Apple:
		.\deduplicate.ps1 -paths "/Users/temp/Downloads","/Users/temp" -showGui false
.NOTES
	Author:             Antoine Engelen
	Version:            0.0.3
	Status:             CONCEPT
		
	Changelog:
		0.0.1           Initial Release
		0.0.2           Paths and files with wildcard brackets in their name will be excluded.
		0.0.3           Added unit test. And started to move methods to classes.
.LINK
	https://gitlab.engelenathome.info/Powershell/Deduplicate
#>
#requires -Version 5.1
[CmdletBinding()]
param (
	# Root folder(s) to scan; handed as-is to Get-DuplicatesFromPaths.
	# NOTE(review): the parameter is Mandatory AND has a Read-Host default; presumably
	# PowerShell's own mandatory prompt wins and the default never runs - confirm.
	[Parameter(Mandatory = $true)]
	[Array]$paths = (Read-Host 'Give one or more accesible path like [c:\] or [("c:\","d:\")]'),
	# Text "true"/"false"; Get-Information converts it with [System.Convert]::ToBoolean
	# to choose between the summary window and console output.
	[Parameter(Mandatory = $true)]
	[String]$showGui = (Read-Host 'In Windows show graphical interface [true] or [false]'),
	# Algorithm name passed to Get-FileHash -Algorithm.
	[Parameter(Mandatory = $false)]
	[String]$hashType = "MD5",
	# Text "true"/"false"; converted with [System.Convert]::ToBoolean at the bottom of the
	# script to choose between the Pester tests and a real scan.
	[Parameter(Mandatory = $false)]
	[String]$unitTest = $false
)

# Install-Module -Name Pester -Force

# Minimal stopwatch: records a start and an end timestamp and renders the elapsed
# time as "HH:mm:ss.fff" in $Difference.
class stopWatch {
	# Moment StartTime() was last called.
	[datetime]$Time1
	# Moment EndTime() was last called.
	[datetime]$Time2
	# Elapsed time between Time1 and Time2; filled by DifferenceBetweenTimes().
	[String]$Difference

	# Records the start of the measured interval.
	[void]StartTime () {
		# Keep the full date and time. Formatting to "HH:mm:ss.fff" and parsing it back
		# dropped the date, so an interval that crossed midnight came out negative.
		$this.Time1 = [datetime]::Now
	}

	# Records the end of the measured interval.
	[void]EndTime () {
		$this.Time2 = [datetime]::Now
	}

	# Computes Time2 - Time1 and stores it, formatted, in Difference.
	[void]DifferenceBetweenTimes() {
		$elapsed = New-TimeSpan -Start $this.Time1 -End $this.Time2
		# Fold whole days into the hour field so long intervals are not truncated.
		$hours = ($elapsed.Days * 24) + $elapsed.Hours
		$this.Difference = '{0:00}:{1:00}:{2:00}.{3:000}' -f $hours, $elapsed.Minutes, $elapsed.Seconds, $elapsed.Milliseconds
	}
}

# Helper class around Write-Progress for the "Hashing" activity.
class systemTaskes {
	# Shows or completes a progress bar.
	#   $id      - progress bar id handed to Write-Progress
	#   $current - zero-based position of the current item
	#   $max     - total number of items
	#   $text    - text shown as the current operation
	[void]ProgressBar($id, $current, $max, $text) {
		# The last item (or anything beyond it) closes the bar.
		$reachedEnd = ($current + 1 -ge $max)
		if ($reachedEnd) {
			Write-Progress -Id $id -Activity Hashing -Completed
			return
		}

		$percentage = ($current / $max) * 100
		Write-Progress -Id $id -Activity Hashing -Status 'Progress->' -PercentComplete $percentage -CurrentOperation $text
	}
}

# Empty placeholder class: it declares no members and nothing in this script uses it.
# NOTE(review): presumably reserved for the "move methods to classes" work mentioned
# in the changelog - confirm before removing.
class deduplicateFactory {

}

function Get-DetermineHardwareArchitecture {
	<#
	.SYNOPSIS
		Reports the pointer width of the running PowerShell process on Windows.
	.DESCRIPTION
		Returns "32-bit", "64-bit" or "Unknown" depending on [IntPtr]::Size.
		On non-Windows platforms nothing is returned.
	.OUTPUTS
		System.String, or nothing when not running on Windows.
	#>

	# The former guard compared the environment-variable list with $IsWindows, which is
	# not an OS check and made the function return nothing. Ask .NET for the platform
	# instead; this also works where $IsWindows is not defined.
	$runningOnWindows = [System.Environment]::OSVersion.Platform -eq [System.PlatformID]::Win32NT
	if (-not $runningOnWindows) {
		return
	}

	# 32-bit = [IntPtr]::Size -eq 4
	# 64-bit = [IntPtr]::Size -eq 8
	if ([IntPtr]::Size -eq 4) {
		Write-Verbose "$(Get-Date): Hardware architecture 32-bit"
		return "32-bit"
	}
	elseif ([IntPtr]::Size -eq 8) {
		Write-Verbose "$(Get-Date): Hardware architecture 64-bit"
		return "64-bit"
	}
	else {
		Write-Verbose "$(Get-Date): Hardware architecture Unknown"
		return "Unknown"
	}
}

function Get-CheckDuplicate {
	<#
	.SYNOPSIS
		Registers a hashed file and records it as a duplicate when an identical one is known.
	.DESCRIPTION
		Looks in $global:fileHashes for an entry with the same length and hash.
		  - Match found: appends @(originalPath, thisPath, length) to $global:duplicates
		    and returns without adding the file to $global:fileHashes.
		  - No match: appends @(path, hash, length) to $global:fileHashes.
		A file without a hash is stored with the hash "[Unknown]" and is never compared.
	.PARAMETER fileFullName
		Full path of the file being registered.
	.PARAMETER fileLength
		Size of the file in bytes.
	.PARAMETER hash
		Hash of the file content; may be omitted or empty when hashing failed.
	#>
	[CmdletBinding()]
	param (
		[Parameter(Mandatory = $true)]
		[String]$fileFullName,
		[Parameter(Mandatory = $true)]
		[Int64]$fileLength,
		[Parameter(Mandatory = $false)]
		[String]$hash
	)

	# A [String] parameter is never $null (a missing value arrives as ""), so test for
	# emptiness instead of comparing with $null.
	if ([String]::IsNullOrEmpty($hash)) {
		$hash = "[Unknown]"
	}
	else {
		# A foreach statement is used on purpose: 'return' inside a ForEach-Object script
		# block only ends that iteration, so execution used to fall through and overwrite
		# every stored hash with "[Unknown]".
		foreach ($knownFile in $global:fileHashes) {
			if ($knownFile[2] -eq $fileLength -and $knownFile[1] -eq $hash) {
				$global:duplicates += @(, @($knownFile[0], $fileFullName, $fileLength))
				return
			}
		}
	}

	$global:fileHashes += @(, @($fileFullName, $hash, $fileLength))
}

function Get-FileContent {
	<#
	.SYNOPSIS
		Hashes every file of a folder listing and registers it for duplicate detection.
	.DESCRIPTION
		For each entry whose path is syntactically valid:
		  - empty files (length 0) are appended to $global:ignoreds as @(path, length);
		  - other files are hashed with Get-FileHash (algorithm taken from the script's
		    $hashType) and handed to Get-CheckDuplicate.
		Files that cannot be hashed are skipped instead of being registered with the
		previous file's hash; access-denied files are appended to $global:accessDenied.
		Timing uses the global stopwatches and progress is reported on progress bar 2.
	.PARAMETER folderContent
		File objects exposing FullName, Name and Length, such as the output of
		Get-FolderContent. May be $null, empty, or a single object.
	#>
	[CmdletBinding()]
	param (
		[Parameter(Mandatory = $true)]
		[AllowNull()]
		[AllowEmptyCollection()]
		[Object]$folderContent
	)

	# Normalise to an array: an empty folder yields $null and a folder with one file
	# yields a bare object that has no IndexOf() method.
	$files = @($folderContent | Where-Object { $null -ne $_ })
	$total = $files.Count
	$index = -1

	foreach ($file in $files) {
		$index++
		# Keep the path in a local: inside a catch block $_ is the error record, not the file.
		$fullName = $file.FullName

		if (Test-Path -IsValid -Path "$fullName" -ErrorAction SilentlyContinue) {
			if ($file.Length -le 0) {
				$global:ignoreds += @(, @($fullName, $file.Length))
				continue
			}

			# Reset per file so a failure can never reuse the previous file's hash.
			$fileHash = $null
			$global:stopWatchFileHash.StartTime()
			try {
				# -ErrorAction Stop turns read failures into catchable errors;
				# -LiteralPath keeps names containing [ ] from being treated as wildcards.
				$fileHash = Get-FileHash -Algorithm $hashType -LiteralPath $fullName -ErrorAction Stop
			}
			catch {
				$failure = $_
				# Depending on the PowerShell version the access-denied condition may show
				# up as the exception type or only as the error category; check both.
				# NOTE(review): where neither is set the file takes the warning path below.
				$accessWasDenied = ($failure.Exception -is [System.UnauthorizedAccessException]) -or
					($failure.CategoryInfo.Category -eq [System.Management.Automation.ErrorCategory]::PermissionDenied)
				if ($accessWasDenied) {
					Write-Verbose "$(Get-Date) Warning: Unauthorized Access Exception"
					$global:accessDenied += @(, @("$fullName"))
				}
				else {
					# Skip the file instead of calling Exit, which ended the whole session.
					Write-Warning "An unforseen error has happend."
					Write-Warning "During the hash routine."
					Write-Warning "Last error: $($failure.Exception.Message)"
				}
				continue
			}
			$global:stopWatchFileHash.EndTime()
			$global:stopWatchFileHash.DifferenceBetweenTimes()

			$global:stopWatchCheckDuplicate.StartTime()
			Get-CheckDuplicate -fileFullName $fullName -fileLength $file.Length -hash $fileHash.Hash
			$global:stopWatchCheckDuplicate.EndTime()
			$global:stopWatchCheckDuplicate.DifferenceBetweenTimes()

			$global:sytemTaskes.ProgressBar(2, $index, $total, "($($index)/$($total)) Hashing: $($global:stopWatchFileHash.Difference) Deduplicatng: $($global:stopWatchCheckDuplicate.Difference) HASH: $($fileHash.Hash) Size: $(Get-CoverToByteSize $file.Length) Name: $($file.Name)")
		}
		else {
			try {
				# Only probing for the error; discard the item so it does not leak into
				# the function's output.
				$null = Get-Item -Path $fullName -ErrorAction Stop
			}
			catch [System.UnauthorizedAccessException] {
				$global:accessDenied += @(, @($fullName))
			}
			catch {
				Write-Warning "An unforseen error has happend."
				Write-Warning "During the hash routine."
				Write-Warning "Last error: $($_.Exception)"
			}
		}
	}

	# Close the bar even when the last file was skipped or ignored.
	if ($total -gt 0) {
		$global:sytemTaskes.ProgressBar(2, $total, $total, "")
	}
}

function Get-FolderContent {
	<#
	.SYNOPSIS
		Lists all files below a folder, recursively.
	.PARAMETER path
		Folder to scan.
	.OUTPUTS
		The file objects returned by Get-ChildItem -File -Recurse. Errors raised while
		enumerating are suppressed (-ErrorAction SilentlyContinue).
	#>
	[CmdletBinding()]
	param (
		[Parameter(Mandatory = $true)]
		[String]$path
	)

	# Report the folder actually being scanned; $_ is not this function's argument.
	Write-Verbose "$(Get-Date): Getting content of folder $($path)"
	return (Get-ChildItem -Path $path -File -Recurse -ErrorAction SilentlyContinue)
}

function Get-CoverToByteSize {
	<#
	.SYNOPSIS
		Formats a byte count as a rounded number followed by a unit name.
	.DESCRIPTION
		Picks the largest power of 1024 (up to 1024^8) that does not exceed the size and
		takes the matching label from $global:BytePrefixNames.
		Negative sizes produce no result.
	.PARAMETER size
		Number of bytes.
	#>
	[CmdletBinding()]
	param (
		[Parameter(Mandatory = $true)]
		[Int64]$size
	)

	$formatted = $null
	if ($size -ge 0) {
		# Climb the unit ladder while the next unit still fits into the size.
		$unitIndex = 0
		while ($unitIndex -lt 8 -and $size -ge [Math]::Pow(1024, $unitIndex + 1)) {
			$unitIndex++
		}

		if ($unitIndex -eq 0) {
			$formatted = "$([math]::Round($size)) $($global:BytePrefixNames[0])"
		}
		else {
			$formatted = "$([math]::Round($size / [Math]::Pow(1024, $unitIndex))) $($global:BytePrefixNames[$unitIndex])"
		}
	}
	return $formatted
}

function Get-DuplicatesFromPaths {
	<#
	.SYNOPSIS
		Scans every given path for duplicate files.
	.DESCRIPTION
		For each existing path: updates progress bar 1, lists the folder with
		Get-FolderContent and processes the listing with Get-FileContent.
		Paths that do not exist are only mentioned on the verbose stream.
	.PARAMETER paths
		One or more folders to scan.
	#>
	[CmdletBinding()]
	Param (
		[Parameter(Mandatory = $true)]
		[Array]$paths
	)

	$paths | ForEach-Object {
		# Guard clause: 'return' inside ForEach-Object moves on to the next path.
		if (-not (Test-Path $_)) {
			Write-Verbose "$(Get-Date): Folder $($_) doesn't exist"          
			return
		}

		$position = $paths.IndexOf($_)
		$pathTotal = $paths.Count
		$global:sytemTaskes.ProgressBar(1, $position, $pathTotal, "($($position+1)/$($pathTotal)) path $($_)") 
		$listing = Get-FolderContent -path $_
		Get-FileContent -folderContent $listing
	}
}

function Get-DuplicatesExportToJson {
	<#
	.SYNOPSIS
		Writes $global:duplicates as JSON to .\export.json in the current directory.
	.DESCRIPTION
		The list is passed through -InputObject rather than the pipeline. Piping unrolls
		the outer array, so a single duplicate was written as a flat list and an empty
		list produced no JSON at all; now the file always holds a JSON array of entries.
	#>
	ConvertTo-Json -InputObject $global:duplicates | Set-Content -path .\export.json
}

function Get-RemoveExportJson {
	<#
	.SYNOPSIS
		Deletes .\export.json from the current directory when it exists.
	#>
	$exportIsPresent = Test-Path -Path .\export.json
	if (-not $exportIsPresent) {
		return
	}

	Remove-Item .\export.json
}

function Get-Information {
	<#
	.SYNOPSIS
		Summarises the scan: file counts and total sizes per category.
	.DESCRIPTION
		Sums the sizes stored in $global:ignoreds, $global:fileHashes and
		$global:duplicates. Depending on the script's $showGui value the summary is
		shown in a window (New-DrawWindow) or written to the host, one line per figure.
	#>

	# Size sits at index 1 for ignored files and at index 2 for hashed/duplicate entries.
	$global:ignoreds | ForEach-Object -Process { $sizeIgnored += $_[1] }
	$global:fileHashes | ForEach-Object -Process { $sizeHased += $_[2] }
	$global:duplicates | ForEach-Object -Process { $sizeDuplicate += $_[2] }

	$useWindow = [System.Convert]::ToBoolean($($showGui))

	if ($useWindow) {
		$summary = "Acces denied files: $($global:accessDenied.Count)`n" +
			"Known files ignored: $($global:ignoreds.Count)`n" +
			"Sum of known ignored files: $(Get-CoverToByteSize $sizeIgnored)`n" +
			"Files hashed: $($global:fileHashes.Count)`n" +
			"Sum of hashed files: $(Get-CoverToByteSize $sizeHased)`n" +
			"Duplicates found: $($global:duplicates.Count)`n" +
			"Sum of duplicate files: $(Get-CoverToByteSize $sizeDuplicate)`n"
		New-DrawWindow -text $summary
		return
	}

	# Console output: every line gets its own timestamp.
	$figures = @(
		"Acces denied files $($global:accessDenied.Count)",
		"Known files ignored $($global:ignoreds.Count)",
		"Sum of known ignored files $(Get-CoverToByteSize $sizeIgnored)",
		"Files hashed $($global:fileHashes.Count)",
		"Sum of hashed files $(Get-CoverToByteSize $sizeHased)",
		"Duplicates found $($global:duplicates.Count)",
		"Sum of duplicate files $(Get-CoverToByteSize $sizeDuplicate)"
	)
	foreach ($figure in $figures) {
		Write-Host "$(Get-Date): $figure"
	}
}

function DoExec {
	<#
	.SYNOPSIS
		Closes the given form; used as the shared action of the window's buttons and keys.
	.PARAMETER objForm
		Object exposing a Close() method.
	#>
	[CmdletBinding()]
	Param (
		[Parameter(Mandatory = $true)]
		[Object]$objForm
	)

	end {
		$objForm.Close()
	}
}

function Get-ApplicationIcon {
	<#
	.SYNOPSIS
		Returns the icon associated with powershell.exe for use as the window icon.
	.DESCRIPTION
		Chooses the System32 or SysWOW64 folder below %WinDir% based on the value
		returned by Get-DetermineHardwareArchitecture; any other value yields $null.
	#>
	$windowsRoot = ${Env:WinDir}
	$system32Folder = $windowsRoot + "/System32/"
	$sysWow64Folder = $windowsRoot + "/SysWOW64/"
	$relativeExecutable = "\WindowsPowerShell\v1.0\powershell.exe"

	Switch (Get-DetermineHardwareArchitecture) {
		"32-bit" {
			return [system.drawing.icon]::ExtractAssociatedIcon($system32Folder + $relativeExecutable)
		}
		"64-bit" {
			return [system.drawing.icon]::ExtractAssociatedIcon($sysWow64Folder + $relativeExecutable)
		}
		Default {
			return $null
		}
	}
}

function New-DrawWindow {
	<#
	.SYNOPSIS
		Shows a modal, top-most window containing the given text and OK/Cancel buttons.
	.DESCRIPTION
		Builds a Windows Forms dialog. OK, Cancel, Enter and Escape all close it through
		DoExec. The call blocks until the window is closed and produces no output.
	.PARAMETER text
		Text displayed in the window.
	#>
	[CmdletBinding()]
	Param (
		[Parameter(Mandatory = $true)]
		[String]$text
	)

	# Add-Type replaces the obsolete Assembly.LoadWithPartialName.
	Add-Type -AssemblyName System.Drawing
	Add-Type -AssemblyName System.Windows.Forms

	#Setting constants for GUI.
	$F_Width = 400
	$F_Height = 180
	$B_PosX = 200
	$B_PosY = 100
	$B_Width = 75
	$B_Height = 23

	$objForm = New-Object System.Windows.Forms.Form
	try {
		$objForm.Text = "Powershell - Deduplicate"
		$objForm.Size = New-Object System.Drawing.Size($F_Width, $F_Height)
		$objForm.StartPosition = "CenterScreen"
		$objForm.Icon = Get-ApplicationIcon

		# KeyPreview lets the form see key presses before the focused control does.
		$objForm.KeyPreview = $true
		# Keep the key code on the left of -eq so the string is converted to the Keys enum.
		$objForm.Add_KeyDown( { if (($_.KeyCode -eq "Enter") -or ($_.KeyCode -eq "Escape")) { DoExec -objForm $objForm } })

		$OKButton = New-Object System.Windows.Forms.Button
		# Location is a Point, not a Size.
		$OKButton.Location = New-Object System.Drawing.Point($B_PosX, $B_PosY)
		$OKButton.Size = New-Object System.Drawing.Size($B_Width, $B_Height)
		$OKButton.Text = "OK"
		$OKButton.Add_Click( { DoExec -objForm $objForm })
		$objForm.Controls.Add($OKButton)

		$CancelButton = New-Object System.Windows.Forms.Button
		$CancelButton.Location = New-Object System.Drawing.Point(($B_PosX + $B_Width), $B_PosY)
		$CancelButton.Size = New-Object System.Drawing.Size($B_Width, $B_Height)
		$CancelButton.Text = "Cancel"
		$CancelButton.Add_Click( { DoExec -objForm $objForm })
		$objForm.Controls.Add($CancelButton)

		$objLabel = New-Object System.Windows.Forms.Label
		$objLabel.Location = New-Object System.Drawing.Point(20, 20)
		$objLabel.Size = New-Object System.Drawing.Size($F_Width, 250)
		$objLabel.Text = $text
		$objForm.Controls.Add($objLabel)

		$objForm.Topmost = $true

		$objForm.Add_Shown( { $objForm.Activate() })
		[void] $objForm.ShowDialog()
	}
	finally {
		# A form shown with ShowDialog is not disposed when it closes; release it here.
		$objForm.Dispose()
	}
	# The stray, undefined $DrawTextBox expression that ended the function was removed:
	# it only wrote a $null into the output stream.
}

#Setting global variables.
# Result lists filled during the scan; they remain available after the script ends.
#   duplicates   : @(originalPath, duplicatePath, length) per duplicate
#   fileHashes   : @(path, hash, length) per registered file
#   ignoreds     : @(path, length) per skipped empty file
#   accessDenied : @(path) per unreadable file
[Array]$global:duplicates = @()
[Array]$global:fileHashes = @()
[Array]$global:ignoreds = @()
[Array]$global:accessDenied = @()
# Unit labels indexed by power of 1024. The sizes are bytes, so the symbols end in a
# capital "B" ("GiB"); the former "Gib" spelling denotes gibibits.
[Array]$global:BytePrefixNames = ("bytes", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")
# Shared helpers: one stopwatch per measured step plus the progress-bar wrapper.
[stopWatch]$global:stopWatchFileHash = [stopWatch]::New()
[stopWatch]$global:stopWatchCheckDuplicate = [stopWatch]::New()
[systemTaskes]$global:sytemTaskes = [systemTaskes]::New()

# Entry point: either run the built-in Pester tests or perform the scan, depending on
# the text value of $unitTest.
Clear-Host
Write-Host "$(Get-Date): Deduplicate Script begins"
if ([System.Convert]::ToBoolean($($unitTest))) {
	# Pester tests
	# Describe/It/Mock/Should/Assert-MockCalled are not defined in this file; presumably
	# they come from the Pester module hinted at near the top of the script.
	Describe 'Unit-test' {
		# Ten stopwatch checks, each sleeping for a random whole number of seconds below 9
		# and expecting the formatted difference to start with that number of seconds.
		for ($loopCounter=0; $loopCounter -lt 10; $loopCounter++) {
			$randomValue = Get-Random -Maximum 9
			It "Stopwatch should register close to $randomValue seconds" {
				$global:stopWatchFileHash.StartTime()
				sleep $randomValue 
				$global:stopWatchFileHash.EndTime()
				$global:stopWatchFileHash.DifferenceBetweenTimes()
				$global:stopWatchFileHash.Difference | Should -BeLike ("00:00:0$($randomValue).0*")
			}
		}
		It "There should be atleast 9 byte prefix names" {
			$BytePrefixNames.Count | Should -BeGreaterThan 8
		}
		# Disabled test, kept as found.
		#       It 'All files should be proccesed' {
		#           $paths | ForEach-Object {
		#               $totalFileCount += $paths.IndexOf($_).Count
		#           }         
		#           $fileHashes.Count + $ignoreds.Count + $accessDenied.count + 1 | Should -HaveCount $totalFileCount
		#       }

		# Create a real export.json so Get-RemoveExportJson finds something to remove.
		"Hello world!!!" | Set-Content -path .\export.json

		It 'Removes Export.json' {
			# Remove-Item is mocked, so the file itself is left in place by this test.
			Mock -CommandName Remove-Item -MockWith { }
	
			Get-RemoveExportJson
	
			Assert-MockCalled -CommandName Remove-Item -Times 1 -Exactly
		}

		# Clean up the file created above.
		if (Test-Path -Path .\export.json ) {
			Remove-Item .\export.json -Force
		}

		# Dummy payload for the export test below.
		$global:duplicates = "hello world!!"

		It 'Export.json should exists in the script folder' {
			# Set-Content is mocked: only the call is verified, no file is written.
			Mock -CommandName Set-Content -MockWith { }
	
			Get-DuplicatesExportToJson
	
			Assert-MockCalled -CommandName Set-Content -Times 1 -Exactly
		}
	}
}
else {
	# Normal run: drop the previous export, scan, print the summary, write the new export.
	Write-Host "$(Get-Date): Getting content from given path(s)."
	Write-Host "$(Get-Date): This may take awhile dependig on size of the filestructure."
	Get-RemoveExportJson
	Get-DuplicatesFromPaths -paths $paths
	Get-Information
	Get-DuplicatesExportToJson
}
Write-Host "$(Get-Date): Deduplicate Script completed!"

Tips

  • Dit script is alleen getest op Windows 10.
  • Vereist PowerShell 5.1.
  • UNC-paden werken ook als invoerwaarden.
  • Het kan nodig zijn om uit te voeren: Set-ExecutionPolicy -ExecutionPolicy RemoteSigned

Licentie

MIT

Copyright (c) 2019 Antoine Engelen
Hierbij wordt gratis toestemming verleend aan iedereen die een kopie van deze software en bijbehorende documentatiebestanden (de "Software") verkrijgt, om te gebruiken zonder enige beperking...

Labels