Error when calculating MD5Hash for large files (#56)
* [domain] SizeTranslation includes decimals for larger sizes * [core] MD5HashGenerator rewrite for memory efficiency No longer attempt to create an Array the size of the file to be parsed. Now it creates a single small buffer and reads 8kb chunks in at a time, only creating an additional smaller buffer to read the tail of the file. Remove methods for parsing only part of a file, as they were no longer used, and remove the relevant tests.
This commit is contained in:
parent
ff1adf0ca4
commit
8c89cc2489
4 changed files with 82 additions and 58 deletions
|
@ -6,45 +6,57 @@ import java.security.MessageDigest
|
||||||
import cats.effect.IO
|
import cats.effect.IO
|
||||||
import net.kemitix.s3thorp.domain.MD5Hash
|
import net.kemitix.s3thorp.domain.MD5Hash
|
||||||
|
|
||||||
|
import scala.collection.immutable.NumericRange
|
||||||
|
|
||||||
object MD5HashGenerator {
|
object MD5HashGenerator {
|
||||||
|
|
||||||
def md5File(file: File)
|
def md5File(file: File)
|
||||||
(implicit info: Int => String => IO[Unit]): IO[MD5Hash] =
|
(implicit info: Int => String => IO[Unit]): IO[MD5Hash] = {
|
||||||
md5FilePart(file, 0, file.length)
|
|
||||||
|
|
||||||
def md5FilePart(file: File,
|
val maxBufferSize = 8048
|
||||||
offset: Long,
|
|
||||||
size: Long)
|
|
||||||
(implicit info: Int => String => IO[Unit]): IO[MD5Hash] = {
|
|
||||||
val buffer = new Array[Byte](size.toInt)
|
|
||||||
|
|
||||||
def readIntoBuffer = {
|
val defaultBuffer = new Array[Byte](maxBufferSize)
|
||||||
fis: FileInputStream =>
|
|
||||||
IO {
|
|
||||||
fis skip offset
|
|
||||||
fis read buffer
|
|
||||||
fis
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def closeFile = {fis: FileInputStream => IO(fis.close())}
|
|
||||||
|
|
||||||
def openFile = IO(new FileInputStream(file))
|
def openFile = IO(new FileInputStream(file))
|
||||||
|
|
||||||
def readFile = openFile.bracket(readIntoBuffer)(closeFile)
|
def closeFile = {fis: FileInputStream => IO(fis.close())}
|
||||||
|
|
||||||
|
def nextChunkSize(currentOffset: Long) = {
|
||||||
|
// a value between 1 and maxBufferSize
|
||||||
|
val toRead = file.length - currentOffset
|
||||||
|
val result = Math.min(maxBufferSize, toRead)
|
||||||
|
result.toInt
|
||||||
|
}
|
||||||
|
|
||||||
|
def readToBuffer(fis: FileInputStream,
|
||||||
|
currentOffset: Long) = {
|
||||||
|
val buffer =
|
||||||
|
if (nextChunkSize(currentOffset) < maxBufferSize)
|
||||||
|
new Array[Byte](nextChunkSize(currentOffset))
|
||||||
|
else
|
||||||
|
defaultBuffer
|
||||||
|
fis read buffer
|
||||||
|
buffer
|
||||||
|
}
|
||||||
|
|
||||||
|
def readFile: IO[String] = openFile
|
||||||
|
.bracket(fis => IO {
|
||||||
|
val md5 = MessageDigest getInstance "MD5"
|
||||||
|
NumericRange(0, file.length, maxBufferSize)
|
||||||
|
.foreach{currentOffset => {
|
||||||
|
val buffer = readToBuffer(fis, currentOffset)
|
||||||
|
md5 update buffer
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(md5.digest map ("%02x" format _)).mkString
|
||||||
|
})(closeFile)
|
||||||
|
|
||||||
for {
|
for {
|
||||||
_ <- info(5)(s"md5:reading:offset $offset:size $size:$file")
|
_ <- info(5)(s"md5:reading:size ${file.length}:$file")
|
||||||
_ <- readFile
|
md5 <- readFile
|
||||||
hash = md5PartBody(buffer)
|
hash = MD5Hash(md5)
|
||||||
_ <- info(4)(s"md5:generated:${hash.hash}:$file")
|
_ <- info(4)(s"md5:generated:${hash.hash}:$file")
|
||||||
} yield hash
|
} yield hash
|
||||||
}
|
}
|
||||||
|
|
||||||
def md5PartBody(partBody: Array[Byte]): MD5Hash = {
|
|
||||||
val md5 = MessageDigest getInstance "MD5"
|
|
||||||
md5 update partBody
|
|
||||||
MD5Hash((md5.digest map ("%02x" format _)).mkString)
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,14 +21,6 @@ class MD5HashGeneratorTest extends FunSpec {
|
||||||
assertResult(rootHash)(result)
|
assertResult(rootHash)(result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
describe("read a buffer") {
|
|
||||||
val file = Resource(this, "upload/root-file")
|
|
||||||
val buffer: Array[Byte] = Files.readAllBytes(file.toPath)
|
|
||||||
it("should generate the correct hash") {
|
|
||||||
val result = MD5HashGenerator.md5PartBody(buffer)
|
|
||||||
assertResult(rootHash)(result)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
describe("read a large file (bigger than buffer)") {
|
describe("read a large file (bigger than buffer)") {
|
||||||
val file = Resource(this, "big-file")
|
val file = Resource(this, "big-file")
|
||||||
it("should generate the correct hash") {
|
it("should generate the correct hash") {
|
||||||
|
@ -37,24 +29,5 @@ class MD5HashGeneratorTest extends FunSpec {
|
||||||
assertResult(expected)(result)
|
assertResult(expected)(result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
describe("read part of a file") {
|
|
||||||
val file = Resource(this, "big-file")
|
|
||||||
val halfFileLength = file.length / 2
|
|
||||||
assertResult(file.length)(halfFileLength * 2)
|
|
||||||
describe("when starting at the beginning of the file") {
|
|
||||||
it("should generate the correct hash") {
|
|
||||||
val expected = MD5Hash("aadf0d266cefe0fcdb241a51798d74b3")
|
|
||||||
val result = MD5HashGenerator.md5FilePart(file, 0, halfFileLength).unsafeRunSync
|
|
||||||
assertResult(expected)(result)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
describe("when starting in the middle of the file") {
|
|
||||||
it("should generate the correct hash") {
|
|
||||||
val expected = MD5Hash("16e08d53ca36e729d808fd5e4f7e35dc")
|
|
||||||
val result = MD5HashGenerator.md5FilePart(file, halfFileLength, halfFileLength).unsafeRunSync
|
|
||||||
assertResult(expected)(result)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,11 +2,15 @@ package net.kemitix.s3thorp.domain
|
||||||
|
|
||||||
object SizeTranslation {
|
object SizeTranslation {
|
||||||
|
|
||||||
|
val kbLimit = 10240L
|
||||||
|
val mbLimit = kbLimit * 1024
|
||||||
|
val gbLimit = mbLimit * 1024
|
||||||
|
|
||||||
def sizeInEnglish(length: Long): String =
|
def sizeInEnglish(length: Long): String =
|
||||||
length match {
|
length.toDouble match {
|
||||||
case bytes if bytes > 1024 * 1024 * 1024 => s"${bytes / 1024 / 1024 /1024}Gb"
|
case bytes if bytes > gbLimit => f"${bytes / 1024 / 1024 /1024}%.3fGb"
|
||||||
case bytes if bytes > 1024 * 1024 => s"${bytes / 1024 / 1024}Mb"
|
case bytes if bytes > mbLimit => f"${bytes / 1024 / 1024}%.2fMb"
|
||||||
case bytes if bytes > 1024 => s"${bytes / 1024}Kb"
|
case bytes if bytes > kbLimit => f"${bytes / 1024}%.0fKb"
|
||||||
case bytes => s"${length}b"
|
case bytes => s"${length}b"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
package net.kemitix.s3thorp.domain
|
||||||
|
|
||||||
|
import org.scalatest.FunSpec
|
||||||
|
|
||||||
|
class SizeTranslationTest extends FunSpec {
|
||||||
|
|
||||||
|
describe("sizeInEnglish") {
|
||||||
|
describe("when size is less the 1Kb") {
|
||||||
|
it("should in in bytes") {
|
||||||
|
assertResult("512b")(SizeTranslation.sizeInEnglish(512))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
describe("when size is a less than 10Kb") {
|
||||||
|
it("should still be in bytes") {
|
||||||
|
assertResult("2000b")(SizeTranslation.sizeInEnglish(2000))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
describe("when size is over 10Kb and less than 10Mb") {
|
||||||
|
it("should be in Kb with zero decimal places") {
|
||||||
|
assertResult("5468Kb")(SizeTranslation.sizeInEnglish(5599232))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
describe("when size is over 10Mb and less than 10Gb") {
|
||||||
|
it("should be in Mb with two decimal place") {
|
||||||
|
assertResult("5468.17Mb")(SizeTranslation.sizeInEnglish(5733789833L))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
describe("when size is over 10Gb") {
|
||||||
|
it("should be in Gb with three decimal place") {
|
||||||
|
assertResult("5468.168Gb")(SizeTranslation.sizeInEnglish(5871400857278L))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in a new issue