From 8c89cc2489f9ddf391e18539a036a3d244679d70 Mon Sep 17 00:00:00 2001 From: Paul Campbell Date: Tue, 11 Jun 2019 20:38:14 +0100 Subject: [PATCH] Error when calculating MD5Hash for large files (#56) * [domain] SizeTranslation includes decimals for larger sizes * [core] MD5HashGenerator rewrite for memory efficiency No longer attempt to create an Array the size of the file to be parsed. Now it creates a single small buffer and reads 8kb chunks in at a time. Only creating an additional smaller buffer to read the tail of the file. Remove methods to parsing only part of a file are they were no longer used, and remove the relevant tests. --- .../MD5HashGenerator.scala | 66 +++++++++++-------- .../s3thorp/core/MD5HashGeneratorTest.scala | 27 -------- .../s3thorp/domain/SizeTranslation.scala | 12 ++-- .../s3thorp/domain/SizeTranslationTest.scala | 35 ++++++++++ 4 files changed, 82 insertions(+), 58 deletions(-) create mode 100644 domain/src/test/scala/net/kemitix/s3thorp/domain/SizeTranslationTest.scala diff --git a/core/src/main/scala/net.kemitix.s3thorp.core/MD5HashGenerator.scala b/core/src/main/scala/net.kemitix.s3thorp.core/MD5HashGenerator.scala index 8095cbe..9f9a247 100644 --- a/core/src/main/scala/net.kemitix.s3thorp.core/MD5HashGenerator.scala +++ b/core/src/main/scala/net.kemitix.s3thorp.core/MD5HashGenerator.scala @@ -6,45 +6,57 @@ import java.security.MessageDigest import cats.effect.IO import net.kemitix.s3thorp.domain.MD5Hash +import scala.collection.immutable.NumericRange + object MD5HashGenerator { def md5File(file: File) - (implicit info: Int => String => IO[Unit]): IO[MD5Hash] = - md5FilePart(file, 0, file.length) + (implicit info: Int => String => IO[Unit]): IO[MD5Hash] = { - def md5FilePart(file: File, - offset: Long, - size: Long) - (implicit info: Int => String => IO[Unit]): IO[MD5Hash] = { - val buffer = new Array[Byte](size.toInt) + val maxBufferSize = 8048 - def readIntoBuffer = { - fis: FileInputStream => - IO { - fis skip offset - fis read buffer - fis - } - } - - def closeFile = {fis: FileInputStream => IO(fis.close())} + val defaultBuffer = new Array[Byte](maxBufferSize) def openFile = IO(new FileInputStream(file)) - def readFile = openFile.bracket(readIntoBuffer)(closeFile) + def closeFile = {fis: FileInputStream => IO(fis.close())} + + def nextChunkSize(currentOffset: Long) = { + // a value between 1 and maxBufferSize + val toRead = file.length - currentOffset + val result = Math.min(maxBufferSize, toRead) + result.toInt + } + + def readToBuffer(fis: FileInputStream, + currentOffset: Long) = { + val buffer = + if (nextChunkSize(currentOffset) < maxBufferSize) + new Array[Byte](nextChunkSize(currentOffset)) + else + defaultBuffer + fis read buffer + buffer + } + + def readFile: IO[String] = openFile + .bracket(fis => IO { + val md5 = MessageDigest getInstance "MD5" + NumericRange(0, file.length, maxBufferSize) + .foreach{currentOffset => { + val buffer = readToBuffer(fis, currentOffset) + md5 update buffer + } + } + (md5.digest map ("%02x" format _)).mkString + })(closeFile) for { - _ <- info(5)(s"md5:reading:offset $offset:size $size:$file") - _ <- readFile - hash = md5PartBody(buffer) + _ <- info(5)(s"md5:reading:size ${file.length}:$file") + md5 <- readFile + hash = MD5Hash(md5) _ <- info(4)(s"md5:generated:${hash.hash}:$file") } yield hash } - def md5PartBody(partBody: Array[Byte]): MD5Hash = { - val md5 = MessageDigest getInstance "MD5" - md5 update partBody - MD5Hash((md5.digest map ("%02x" format _)).mkString) - } - } diff --git a/core/src/test/scala/net/kemitix/s3thorp/core/MD5HashGeneratorTest.scala b/core/src/test/scala/net/kemitix/s3thorp/core/MD5HashGeneratorTest.scala index 439fdf0..c71000c 100644 --- a/core/src/test/scala/net/kemitix/s3thorp/core/MD5HashGeneratorTest.scala +++ b/core/src/test/scala/net/kemitix/s3thorp/core/MD5HashGeneratorTest.scala @@ -21,14 +21,6 @@ class MD5HashGeneratorTest extends FunSpec { assertResult(rootHash)(result) } } - describe("read a buffer") { - val file = Resource(this, "upload/root-file") - val buffer: Array[Byte] = Files.readAllBytes(file.toPath) - it("should generate the correct hash") { - val result = MD5HashGenerator.md5PartBody(buffer) - assertResult(rootHash)(result) - } - } describe("read a large file (bigger than buffer)") { val file = Resource(this, "big-file") it("should generate the correct hash") { @@ -37,24 +29,5 @@ class MD5HashGeneratorTest extends FunSpec { assertResult(expected)(result) } } - describe("read part of a file") { - val file = Resource(this, "big-file") - val halfFileLength = file.length / 2 - assertResult(file.length)(halfFileLength * 2) - describe("when starting at the beginning of the file") { - it("should generate the correct hash") { - val expected = MD5Hash("aadf0d266cefe0fcdb241a51798d74b3") - val result = MD5HashGenerator.md5FilePart(file, 0, halfFileLength).unsafeRunSync - assertResult(expected)(result) - } - } - describe("when starting in the middle of the file") { - it("should generate the correct hash") { - val expected = MD5Hash("16e08d53ca36e729d808fd5e4f7e35dc") - val result = MD5HashGenerator.md5FilePart(file, halfFileLength, halfFileLength).unsafeRunSync - assertResult(expected)(result) - } - } - } } diff --git a/domain/src/main/scala/net/kemitix/s3thorp/domain/SizeTranslation.scala b/domain/src/main/scala/net/kemitix/s3thorp/domain/SizeTranslation.scala index ee21975..eabc885 100644 --- a/domain/src/main/scala/net/kemitix/s3thorp/domain/SizeTranslation.scala +++ b/domain/src/main/scala/net/kemitix/s3thorp/domain/SizeTranslation.scala @@ -2,11 +2,15 @@ package net.kemitix.s3thorp.domain object SizeTranslation { + val kbLimit = 10240L + val mbLimit = kbLimit * 1024 + val gbLimit = mbLimit * 1024 + def sizeInEnglish(length: Long): String = - length match { - case bytes if bytes > 1024 * 1024 * 1024 => s"${bytes / 1024 / 1024 /1024}Gb" - case bytes if bytes > 1024 * 1024 => s"${bytes / 1024 / 1024}Mb" - case bytes if bytes > 1024 => s"${bytes / 1024}Kb" + length.toDouble match { + case bytes if bytes > gbLimit => f"${bytes / 1024 / 1024 /1024}%.3fGb" + case bytes if bytes > mbLimit => f"${bytes / 1024 / 1024}%.2fMb" + case bytes if bytes > kbLimit => f"${bytes / 1024}%.0fKb" case bytes => s"${length}b" } diff --git a/domain/src/test/scala/net/kemitix/s3thorp/domain/SizeTranslationTest.scala b/domain/src/test/scala/net/kemitix/s3thorp/domain/SizeTranslationTest.scala new file mode 100644 index 0000000..48b3141 --- /dev/null +++ b/domain/src/test/scala/net/kemitix/s3thorp/domain/SizeTranslationTest.scala @@ -0,0 +1,35 @@ +package net.kemitix.s3thorp.domain + +import org.scalatest.FunSpec + +class SizeTranslationTest extends FunSpec { + + describe("sizeInEnglish") { + describe("when size is less the 1Kb") { + it("should in in bytes") { + assertResult("512b")(SizeTranslation.sizeInEnglish(512)) + } + } + describe("when size is a less than 10Kb") { + it("should still be in bytes") { + assertResult("2000b")(SizeTranslation.sizeInEnglish(2000)) + } + } + describe("when size is over 10Kb and less than 10Mb") { + it("should be in Kb with zero decimal places") { + assertResult("5468Kb")(SizeTranslation.sizeInEnglish(5599232)) + } + } + describe("when size is over 10Mb and less than 10Gb") { + it("should be in Mb with two decimal place") { + assertResult("5468.17Mb")(SizeTranslation.sizeInEnglish(5733789833L)) + } + } + describe("when size is over 10Gb") { + it("should be in Gb with three decimal place") { + assertResult("5468.168Gb")(SizeTranslation.sizeInEnglish(5871400857278L)) + } + } + } + +}