Error when calculating MD5Hash for large files (#56)

* [domain] SizeTranslation includes decimals for larger sizes * [core] MD5HashGenerator rewrite for memory efficiency No longer attempt to create an Array the size of the file to be parsed. Now it creates a single small buffer and reads 8kb chunks in at a time. Only creating an additional smaller buffer to read the tail of the file. Remove methods to parsing only part of a file are they were no longer used, and remove the relevant tests.
2019-06-11 20:38:14 +01:00 · 2019-06-11 20:38:14 +01:00 · 8c89cc2489
commit 8c89cc2489
parent ff1adf0ca4
4 changed files with 82 additions and 58 deletions
--- a/core/src/main/scala/net.kemitix.s3thorp.core/MD5HashGenerator.scala
+++ b/core/src/main/scala/net.kemitix.s3thorp.core/MD5HashGenerator.scala
@ -6,45 +6,57 @@ import java.security.MessageDigest
 import cats.effect.IO
 import net.kemitix.s3thorp.domain.MD5Hash

+import scala.collection.immutable.NumericRange
+
 object MD5HashGenerator {

  def md5File(file: File)
-             (implicit info: Int => String => IO[Unit]): IO[MD5Hash] =
-    md5FilePart(file, 0, file.length)
+             (implicit info: Int => String => IO[Unit]): IO[MD5Hash] = {

-  def md5FilePart(file: File,
-                  offset: Long,
-                  size: Long)
-                 (implicit info: Int => String => IO[Unit]): IO[MD5Hash] = {
-    val buffer = new Array[Byte](size.toInt)
+    val maxBufferSize = 8048

-    def readIntoBuffer = {
-      fis: FileInputStream =>
-        IO {
-          fis skip offset
-          fis read buffer
-          fis
-        }
-    }
-
-    def closeFile = {fis: FileInputStream => IO(fis.close())}
+    val defaultBuffer = new Array[Byte](maxBufferSize)

    def openFile = IO(new FileInputStream(file))

-    def readFile = openFile.bracket(readIntoBuffer)(closeFile)
+    def closeFile = {fis: FileInputStream => IO(fis.close())}
+
+    def nextChunkSize(currentOffset: Long) = {
+      // a value between 1 and maxBufferSize
+      val toRead = file.length - currentOffset
+      val result = Math.min(maxBufferSize, toRead)
+      result.toInt
+    }
+
+    def readToBuffer(fis: FileInputStream,
+                     currentOffset: Long) = {
+      val buffer =
+        if (nextChunkSize(currentOffset) < maxBufferSize)
+          new Array[Byte](nextChunkSize(currentOffset))
+        else
+          defaultBuffer
+      fis read buffer
+      buffer
+    }
+
+    def readFile: IO[String] = openFile
+      .bracket(fis => IO {
+        val md5 = MessageDigest getInstance "MD5"
+        NumericRange(0, file.length, maxBufferSize)
+          .foreach{currentOffset => {
+            val buffer = readToBuffer(fis, currentOffset)
+            md5 update buffer
+          }
+        }
+        (md5.digest map ("%02x" format _)).mkString
+    })(closeFile)

    for {
-      _ <- info(5)(s"md5:reading:offset $offset:size $size:$file")
-      _ <- readFile
-      hash = md5PartBody(buffer)
+      _ <- info(5)(s"md5:reading:size ${file.length}:$file")
+      md5 <- readFile
+      hash = MD5Hash(md5)
      _ <- info(4)(s"md5:generated:${hash.hash}:$file")
    } yield hash
  }

-  def md5PartBody(partBody: Array[Byte]): MD5Hash = {
-    val md5 = MessageDigest getInstance "MD5"
-    md5 update partBody
-    MD5Hash((md5.digest map ("%02x" format _)).mkString)
-  }
-
 }
--- a/core/src/test/scala/net/kemitix/s3thorp/core/MD5HashGeneratorTest.scala
+++ b/core/src/test/scala/net/kemitix/s3thorp/core/MD5HashGeneratorTest.scala
@ -21,14 +21,6 @@ class MD5HashGeneratorTest extends FunSpec {
        assertResult(rootHash)(result)
      }
    }
-    describe("read a buffer") {
-      val file = Resource(this, "upload/root-file")
-      val buffer: Array[Byte] = Files.readAllBytes(file.toPath)
-      it("should generate the correct hash") {
-        val result = MD5HashGenerator.md5PartBody(buffer)
-        assertResult(rootHash)(result)
-      }
-    }
    describe("read a large file (bigger than buffer)") {
      val file = Resource(this, "big-file")
      it("should generate the correct hash") {
@ -37,24 +29,5 @@ class MD5HashGeneratorTest extends FunSpec {
        assertResult(expected)(result)
      }
    }
-    describe("read part of a file") {
-      val file = Resource(this, "big-file")
-      val halfFileLength = file.length / 2
-      assertResult(file.length)(halfFileLength * 2)
-      describe("when starting at the beginning of the file") {
-        it("should generate the correct hash") {
-          val expected = MD5Hash("aadf0d266cefe0fcdb241a51798d74b3")
-          val result = MD5HashGenerator.md5FilePart(file, 0, halfFileLength).unsafeRunSync
-          assertResult(expected)(result)
-        }
-      }
-      describe("when starting in the middle of the file") {
-        it("should generate the correct hash") {
-          val expected = MD5Hash("16e08d53ca36e729d808fd5e4f7e35dc")
-          val result = MD5HashGenerator.md5FilePart(file, halfFileLength, halfFileLength).unsafeRunSync
-          assertResult(expected)(result)
-        }
-      }
-    }

 }
--- a/domain/src/main/scala/net/kemitix/s3thorp/domain/SizeTranslation.scala
+++ b/domain/src/main/scala/net/kemitix/s3thorp/domain/SizeTranslation.scala
@ -2,11 +2,15 @@ package net.kemitix.s3thorp.domain

 object SizeTranslation {

+  val kbLimit = 10240L
+  val mbLimit = kbLimit * 1024
+  val gbLimit = mbLimit * 1024
+
  def sizeInEnglish(length: Long): String =
-    length match {
-      case bytes if bytes > 1024 * 1024 * 1024 => s"${bytes / 1024 / 1024 /1024}Gb"
-      case bytes if bytes > 1024 * 1024 => s"${bytes / 1024 / 1024}Mb"
-      case bytes if bytes > 1024 => s"${bytes / 1024}Kb"
+    length.toDouble match {
+      case bytes if bytes > gbLimit => f"${bytes / 1024 / 1024 /1024}%.3fGb"
+      case bytes if bytes > mbLimit => f"${bytes / 1024 / 1024}%.2fMb"
+      case bytes if bytes > kbLimit => f"${bytes / 1024}%.0fKb"
      case bytes => s"${length}b"
    }

--- a/domain/src/test/scala/net/kemitix/s3thorp/domain/SizeTranslationTest.scala
+++ b/domain/src/test/scala/net/kemitix/s3thorp/domain/SizeTranslationTest.scala
@ -0,0 +1,35 @@
+package net.kemitix.s3thorp.domain
+
+import org.scalatest.FunSpec
+
+class SizeTranslationTest extends FunSpec {
+
+  describe("sizeInEnglish") {
+    describe("when size is less the 1Kb") {
+      it("should in in bytes") {
+        assertResult("512b")(SizeTranslation.sizeInEnglish(512))
+      }
+    }
+    describe("when size is a less than 10Kb") {
+      it("should still be in bytes") {
+        assertResult("2000b")(SizeTranslation.sizeInEnglish(2000))
+      }
+    }
+    describe("when size is over 10Kb and less than 10Mb") {
+      it("should be in Kb with zero decimal places") {
+        assertResult("5468Kb")(SizeTranslation.sizeInEnglish(5599232))
+      }
+    }
+    describe("when size is over 10Mb and less than 10Gb") {
+      it("should be in Mb with two decimal place") {
+        assertResult("5468.17Mb")(SizeTranslation.sizeInEnglish(5733789833L))
+      }
+    }
+    describe("when size is over 10Gb") {
+      it("should be in Gb with three decimal place") {
+        assertResult("5468.168Gb")(SizeTranslation.sizeInEnglish(5871400857278L))
+      }
+    }
+  }
+
+}