Error when calculating MD5Hash for large files (#56)
* [domain] SizeTranslation includes decimals for larger sizes
* [core] MD5HashGenerator rewrite for memory efficiency
No longer attempt to create an Array the size of the file to be parsed. It now creates a single small buffer and reads the file in 8kb chunks, creating an additional smaller buffer only to read the tail of the file. Remove the methods for parsing only part of a file, as they were no longer used, and remove the relevant tests.
This commit is contained in:
parent
ff1adf0ca4
commit
8c89cc2489
4 changed files with 82 additions and 58 deletions
|
@ -6,45 +6,57 @@ import java.security.MessageDigest
|
||||||
import cats.effect.IO
|
import cats.effect.IO
|
||||||
import net.kemitix.s3thorp.domain.MD5Hash
|
import net.kemitix.s3thorp.domain.MD5Hash
|
||||||
|
|
||||||
|
import scala.collection.immutable.NumericRange
|
||||||
|
|
||||||
object MD5HashGenerator {
|
object MD5HashGenerator {
|
||||||
|
|
||||||
def md5File(file: File)
|
def md5File(file: File)
|
||||||
(implicit info: Int => String => IO[Unit]): IO[MD5Hash] =
|
|
||||||
md5FilePart(file, 0, file.length)
|
|
||||||
|
|
||||||
def md5FilePart(file: File,
|
|
||||||
offset: Long,
|
|
||||||
size: Long)
|
|
||||||
(implicit info: Int => String => IO[Unit]): IO[MD5Hash] = {
|
(implicit info: Int => String => IO[Unit]): IO[MD5Hash] = {
|
||||||
val buffer = new Array[Byte](size.toInt)
|
|
||||||
|
|
||||||
def readIntoBuffer = {
|
val maxBufferSize = 8048
|
||||||
fis: FileInputStream =>
|
|
||||||
IO {
|
|
||||||
fis skip offset
|
|
||||||
fis read buffer
|
|
||||||
fis
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def closeFile = {fis: FileInputStream => IO(fis.close())}
|
val defaultBuffer = new Array[Byte](maxBufferSize)
|
||||||
|
|
||||||
def openFile = IO(new FileInputStream(file))
|
def openFile = IO(new FileInputStream(file))
|
||||||
|
|
||||||
def readFile = openFile.bracket(readIntoBuffer)(closeFile)
|
def closeFile = {fis: FileInputStream => IO(fis.close())}
|
||||||
|
|
||||||
|
def nextChunkSize(currentOffset: Long) = {
|
||||||
|
// a value between 1 and maxBufferSize
|
||||||
|
val toRead = file.length - currentOffset
|
||||||
|
val result = Math.min(maxBufferSize, toRead)
|
||||||
|
result.toInt
|
||||||
|
}
|
||||||
|
|
||||||
|
def readToBuffer(fis: FileInputStream,
|
||||||
|
currentOffset: Long) = {
|
||||||
|
val buffer =
|
||||||
|
if (nextChunkSize(currentOffset) < maxBufferSize)
|
||||||
|
new Array[Byte](nextChunkSize(currentOffset))
|
||||||
|
else
|
||||||
|
defaultBuffer
|
||||||
|
fis read buffer
|
||||||
|
buffer
|
||||||
|
}
|
||||||
|
|
||||||
|
def readFile: IO[String] = openFile
|
||||||
|
.bracket(fis => IO {
|
||||||
|
val md5 = MessageDigest getInstance "MD5"
|
||||||
|
NumericRange(0, file.length, maxBufferSize)
|
||||||
|
.foreach{currentOffset => {
|
||||||
|
val buffer = readToBuffer(fis, currentOffset)
|
||||||
|
md5 update buffer
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(md5.digest map ("%02x" format _)).mkString
|
||||||
|
})(closeFile)
|
||||||
|
|
||||||
for {
|
for {
|
||||||
_ <- info(5)(s"md5:reading:offset $offset:size $size:$file")
|
_ <- info(5)(s"md5:reading:size ${file.length}:$file")
|
||||||
_ <- readFile
|
md5 <- readFile
|
||||||
hash = md5PartBody(buffer)
|
hash = MD5Hash(md5)
|
||||||
_ <- info(4)(s"md5:generated:${hash.hash}:$file")
|
_ <- info(4)(s"md5:generated:${hash.hash}:$file")
|
||||||
} yield hash
|
} yield hash
|
||||||
}
|
}
|
||||||
|
|
||||||
def md5PartBody(partBody: Array[Byte]): MD5Hash = {
|
|
||||||
val md5 = MessageDigest getInstance "MD5"
|
|
||||||
md5 update partBody
|
|
||||||
MD5Hash((md5.digest map ("%02x" format _)).mkString)
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,14 +21,6 @@ class MD5HashGeneratorTest extends FunSpec {
|
||||||
assertResult(rootHash)(result)
|
assertResult(rootHash)(result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
describe("read a buffer") {
|
|
||||||
val file = Resource(this, "upload/root-file")
|
|
||||||
val buffer: Array[Byte] = Files.readAllBytes(file.toPath)
|
|
||||||
it("should generate the correct hash") {
|
|
||||||
val result = MD5HashGenerator.md5PartBody(buffer)
|
|
||||||
assertResult(rootHash)(result)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
describe("read a large file (bigger than buffer)") {
|
describe("read a large file (bigger than buffer)") {
|
||||||
val file = Resource(this, "big-file")
|
val file = Resource(this, "big-file")
|
||||||
it("should generate the correct hash") {
|
it("should generate the correct hash") {
|
||||||
|
@ -37,24 +29,5 @@ class MD5HashGeneratorTest extends FunSpec {
|
||||||
assertResult(expected)(result)
|
assertResult(expected)(result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
describe("read part of a file") {
|
|
||||||
val file = Resource(this, "big-file")
|
|
||||||
val halfFileLength = file.length / 2
|
|
||||||
assertResult(file.length)(halfFileLength * 2)
|
|
||||||
describe("when starting at the beginning of the file") {
|
|
||||||
it("should generate the correct hash") {
|
|
||||||
val expected = MD5Hash("aadf0d266cefe0fcdb241a51798d74b3")
|
|
||||||
val result = MD5HashGenerator.md5FilePart(file, 0, halfFileLength).unsafeRunSync
|
|
||||||
assertResult(expected)(result)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
describe("when starting in the middle of the file") {
|
|
||||||
it("should generate the correct hash") {
|
|
||||||
val expected = MD5Hash("16e08d53ca36e729d808fd5e4f7e35dc")
|
|
||||||
val result = MD5HashGenerator.md5FilePart(file, halfFileLength, halfFileLength).unsafeRunSync
|
|
||||||
assertResult(expected)(result)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,11 +2,15 @@ package net.kemitix.s3thorp.domain
|
||||||
|
|
||||||
object SizeTranslation {
|
object SizeTranslation {
|
||||||
|
|
||||||
|
val kbLimit = 10240L
|
||||||
|
val mbLimit = kbLimit * 1024
|
||||||
|
val gbLimit = mbLimit * 1024
|
||||||
|
|
||||||
def sizeInEnglish(length: Long): String =
|
def sizeInEnglish(length: Long): String =
|
||||||
length match {
|
length.toDouble match {
|
||||||
case bytes if bytes > 1024 * 1024 * 1024 => s"${bytes / 1024 / 1024 /1024}Gb"
|
case bytes if bytes > gbLimit => f"${bytes / 1024 / 1024 /1024}%.3fGb"
|
||||||
case bytes if bytes > 1024 * 1024 => s"${bytes / 1024 / 1024}Mb"
|
case bytes if bytes > mbLimit => f"${bytes / 1024 / 1024}%.2fMb"
|
||||||
case bytes if bytes > 1024 => s"${bytes / 1024}Kb"
|
case bytes if bytes > kbLimit => f"${bytes / 1024}%.0fKb"
|
||||||
case bytes => s"${length}b"
|
case bytes => s"${length}b"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
package net.kemitix.s3thorp.domain
|
||||||
|
|
||||||
|
import org.scalatest.FunSpec
|
||||||
|
|
||||||
|
class SizeTranslationTest extends FunSpec {
|
||||||
|
|
||||||
|
describe("sizeInEnglish") {
|
||||||
|
describe("when size is less the 1Kb") {
|
||||||
|
it("should in in bytes") {
|
||||||
|
assertResult("512b")(SizeTranslation.sizeInEnglish(512))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
describe("when size is a less than 10Kb") {
|
||||||
|
it("should still be in bytes") {
|
||||||
|
assertResult("2000b")(SizeTranslation.sizeInEnglish(2000))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
describe("when size is over 10Kb and less than 10Mb") {
|
||||||
|
it("should be in Kb with zero decimal places") {
|
||||||
|
assertResult("5468Kb")(SizeTranslation.sizeInEnglish(5599232))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
describe("when size is over 10Mb and less than 10Gb") {
|
||||||
|
it("should be in Mb with two decimal place") {
|
||||||
|
assertResult("5468.17Mb")(SizeTranslation.sizeInEnglish(5733789833L))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
describe("when size is over 10Gb") {
|
||||||
|
it("should be in Gb with three decimal place") {
|
||||||
|
assertResult("5468.168Gb")(SizeTranslation.sizeInEnglish(5871400857278L))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in a new issue