Error when calculating MD5Hash for large files (#56)

* [domain] SizeTranslation includes decimals for larger sizes

* [core] MD5HashGenerator rewrite for memory efficiency

It no longer attempts to create an Array the size of the file being
parsed.

Instead, it creates a single small buffer and reads the file in 8KB
chunks, only creating an additional, smaller buffer to read the tail of
the file.

Remove the methods for parsing only part of a file, as they were no
longer used, along with their tests.
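
For reference, a minimal standalone sketch of the chunked approach described above, in plain Scala over java.security.MessageDigest and without the project's IO and logging wrappers. The object name and the 8KB buffer size are illustrative, and this sketch hashes only the bytes actually read on each pass rather than allocating a separate tail buffer as the commit does:

import java.io.{File, FileInputStream}
import java.security.MessageDigest

// Illustrative sketch only: memory use stays constant regardless of file size.
object ChunkedMd5Sketch {

  def md5(file: File): String = {
    val md5 = MessageDigest.getInstance("MD5")
    val buffer = new Array[Byte](8192) // reused 8KB working buffer
    val fis = new FileInputStream(file)
    try {
      var bytesRead = fis.read(buffer)
      while (bytesRead != -1) {
        md5.update(buffer, 0, bytesRead) // hash only the bytes actually read
        bytesRead = fis.read(buffer)
      }
    } finally fis.close()
    md5.digest.map("%02x".format(_)).mkString
  }
}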
Author: Paul Campbell, 2019-06-11 20:38:14 +01:00 (committed by GitHub)
Parent ff1adf0ca4, commit 8c89cc2489
4 changed files with 82 additions and 58 deletions

MD5HashGenerator.scala

@@ -6,45 +6,57 @@ import java.security.MessageDigest
 import cats.effect.IO
 import net.kemitix.s3thorp.domain.MD5Hash
+import scala.collection.immutable.NumericRange
 object MD5HashGenerator {
   def md5File(file: File)
-             (implicit info: Int => String => IO[Unit]): IO[MD5Hash] =
-    md5FilePart(file, 0, file.length)
+             (implicit info: Int => String => IO[Unit]): IO[MD5Hash] = {
-  def md5FilePart(file: File,
-                  offset: Long,
-                  size: Long)
-                 (implicit info: Int => String => IO[Unit]): IO[MD5Hash] = {
-    val buffer = new Array[Byte](size.toInt)
+    val maxBufferSize = 8048
-    def readIntoBuffer = {
-      fis: FileInputStream =>
-        IO {
-          fis skip offset
-          fis read buffer
-          fis
-        }
-    }
-    def closeFile = {fis: FileInputStream => IO(fis.close())}
+    val defaultBuffer = new Array[Byte](maxBufferSize)
     def openFile = IO(new FileInputStream(file))
-    def readFile = openFile.bracket(readIntoBuffer)(closeFile)
+    def closeFile = {fis: FileInputStream => IO(fis.close())}
+    def nextChunkSize(currentOffset: Long) = {
+      // a value between 1 and maxBufferSize
+      val toRead = file.length - currentOffset
+      val result = Math.min(maxBufferSize, toRead)
+      result.toInt
+    }
+    def readToBuffer(fis: FileInputStream,
+                     currentOffset: Long) = {
+      val buffer =
+        if (nextChunkSize(currentOffset) < maxBufferSize)
+          new Array[Byte](nextChunkSize(currentOffset))
+        else
+          defaultBuffer
+      fis read buffer
+      buffer
+    }
+    def readFile: IO[String] = openFile
+      .bracket(fis => IO {
+        val md5 = MessageDigest getInstance "MD5"
+        NumericRange(0, file.length, maxBufferSize)
+          .foreach{currentOffset => {
+            val buffer = readToBuffer(fis, currentOffset)
+            md5 update buffer
+          }
+        }
+        (md5.digest map ("%02x" format _)).mkString
+      })(closeFile)
     for {
-      _ <- info(5)(s"md5:reading:offset $offset:size $size:$file")
-      _ <- readFile
-      hash = md5PartBody(buffer)
+      _ <- info(5)(s"md5:reading:size ${file.length}:$file")
+      md5 <- readFile
+      hash = MD5Hash(md5)
       _ <- info(4)(s"md5:generated:${hash.hash}:$file")
     } yield hash
   }
-  def md5PartBody(partBody: Array[Byte]): MD5Hash = {
-    val md5 = MessageDigest getInstance "MD5"
-    md5 update partBody
-    MD5Hash((md5.digest map ("%02x" format _)).mkString)
-  }
 }
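
A hypothetical usage sketch of the rewritten md5File: the implicit logger matches the Int => String => IO[Unit] parameter above, while the import path for MD5HashGenerator and the chosen log threshold are assumptions:

import java.io.File
import cats.effect.IO
import net.kemitix.s3thorp.core.MD5HashGenerator // package assumed from the [core] module prefix
import net.kemitix.s3thorp.domain.MD5Hash

object Md5FileUsageSketch {

  // Hypothetical logger: print messages at level 4 or below, ignore the rest.
  implicit val info: Int => String => IO[Unit] =
    level => message => if (level <= 4) IO(println(message)) else IO.unit

  def hashOf(path: String): IO[MD5Hash] =
    MD5HashGenerator.md5File(new File(path))
}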

MD5HashGeneratorTest.scala

@@ -21,14 +21,6 @@ class MD5HashGeneratorTest extends FunSpec {
       assertResult(rootHash)(result)
     }
   }
-  describe("read a buffer") {
-    val file = Resource(this, "upload/root-file")
-    val buffer: Array[Byte] = Files.readAllBytes(file.toPath)
-    it("should generate the correct hash") {
-      val result = MD5HashGenerator.md5PartBody(buffer)
-      assertResult(rootHash)(result)
-    }
-  }
   describe("read a large file (bigger than buffer)") {
     val file = Resource(this, "big-file")
     it("should generate the correct hash") {
@@ -37,24 +29,5 @@ class MD5HashGeneratorTest extends FunSpec {
       assertResult(expected)(result)
     }
   }
-  describe("read part of a file") {
-    val file = Resource(this, "big-file")
-    val halfFileLength = file.length / 2
-    assertResult(file.length)(halfFileLength * 2)
-    describe("when starting at the beginning of the file") {
-      it("should generate the correct hash") {
-        val expected = MD5Hash("aadf0d266cefe0fcdb241a51798d74b3")
-        val result = MD5HashGenerator.md5FilePart(file, 0, halfFileLength).unsafeRunSync
-        assertResult(expected)(result)
-      }
-    }
-    describe("when starting in the middle of the file") {
-      it("should generate the correct hash") {
-        val expected = MD5Hash("16e08d53ca36e729d808fd5e4f7e35dc")
-        val result = MD5HashGenerator.md5FilePart(file, halfFileLength, halfFileLength).unsafeRunSync
-        assertResult(expected)(result)
-      }
-    }
-  }
 }

SizeTranslation.scala

@@ -2,11 +2,15 @@ package net.kemitix.s3thorp.domain
 object SizeTranslation {
+  val kbLimit = 10240L
+  val mbLimit = kbLimit * 1024
+  val gbLimit = mbLimit * 1024
   def sizeInEnglish(length: Long): String =
-    length match {
-      case bytes if bytes > 1024 * 1024 * 1024 => s"${bytes / 1024 / 1024 /1024}Gb"
-      case bytes if bytes > 1024 * 1024 => s"${bytes / 1024 / 1024}Mb"
-      case bytes if bytes > 1024 => s"${bytes / 1024}Kb"
+    length.toDouble match {
+      case bytes if bytes > gbLimit => f"${bytes / 1024 / 1024 /1024}%.3fGb"
+      case bytes if bytes > mbLimit => f"${bytes / 1024 / 1024}%.2fMb"
+      case bytes if bytes > kbLimit => f"${bytes / 1024}%.0fKb"
       case bytes => s"${length}b"
     }
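
For illustration, the decimal places come from the f string interpolator used above; a minimal sketch, with a value chosen to match the new test below:

object SizeFormatSketch extends App {
  val bytes = 5733789833L.toDouble          // just over 5468 Mb
  println(f"${bytes / 1024 / 1024}%.2fMb")  // prints "5468.17Mb"
}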

SizeTranslationTest.scala

@@ -0,0 +1,35 @@
+package net.kemitix.s3thorp.domain
+import org.scalatest.FunSpec
+class SizeTranslationTest extends FunSpec {
+  describe("sizeInEnglish") {
+    describe("when size is less than 1Kb") {
+      it("should be in bytes") {
+        assertResult("512b")(SizeTranslation.sizeInEnglish(512))
+      }
+    }
+    describe("when size is less than 10Kb") {
+      it("should still be in bytes") {
+        assertResult("2000b")(SizeTranslation.sizeInEnglish(2000))
+      }
+    }
+    describe("when size is over 10Kb and less than 10Mb") {
+      it("should be in Kb with zero decimal places") {
+        assertResult("5468Kb")(SizeTranslation.sizeInEnglish(5599232))
+      }
+    }
+    describe("when size is over 10Mb and less than 10Gb") {
+      it("should be in Mb with two decimal places") {
+        assertResult("5468.17Mb")(SizeTranslation.sizeInEnglish(5733789833L))
+      }
+    }
+    describe("when size is over 10Gb") {
+      it("should be in Gb with three decimal places") {
+        assertResult("5468.168Gb")(SizeTranslation.sizeInEnglish(5871400857278L))
+      }
+    }
+  }
+}