Error when calculating MD5Hash for large files (#56)
* [domain] SizeTranslation includes decimals for larger sizes * [core] MD5HashGenerator rewrite for memory efficiency No longer attempt to create an Array the size of the file to be parsed. Now it creates a single small buffer and reads 8kb chunks in at a time. Only creating an additional smaller buffer to read the tail of the file. Remove methods to parsing only part of a file are they were no longer used, and remove the relevant tests.
This commit is contained in:
parent
ff1adf0ca4
commit
8c89cc2489
4 changed files with 82 additions and 58 deletions
|
@ -6,45 +6,57 @@ import java.security.MessageDigest
|
|||
import cats.effect.IO
|
||||
import net.kemitix.s3thorp.domain.MD5Hash
|
||||
|
||||
import scala.collection.immutable.NumericRange
|
||||
|
||||
object MD5HashGenerator {
|
||||
|
||||
def md5File(file: File)
|
||||
(implicit info: Int => String => IO[Unit]): IO[MD5Hash] =
|
||||
md5FilePart(file, 0, file.length)
|
||||
|
||||
def md5FilePart(file: File,
|
||||
offset: Long,
|
||||
size: Long)
|
||||
(implicit info: Int => String => IO[Unit]): IO[MD5Hash] = {
|
||||
val buffer = new Array[Byte](size.toInt)
|
||||
|
||||
def readIntoBuffer = {
|
||||
fis: FileInputStream =>
|
||||
IO {
|
||||
fis skip offset
|
||||
fis read buffer
|
||||
fis
|
||||
}
|
||||
}
|
||||
val maxBufferSize = 8048
|
||||
|
||||
def closeFile = {fis: FileInputStream => IO(fis.close())}
|
||||
val defaultBuffer = new Array[Byte](maxBufferSize)
|
||||
|
||||
def openFile = IO(new FileInputStream(file))
|
||||
|
||||
def readFile = openFile.bracket(readIntoBuffer)(closeFile)
|
||||
def closeFile = {fis: FileInputStream => IO(fis.close())}
|
||||
|
||||
def nextChunkSize(currentOffset: Long) = {
|
||||
// a value between 1 and maxBufferSize
|
||||
val toRead = file.length - currentOffset
|
||||
val result = Math.min(maxBufferSize, toRead)
|
||||
result.toInt
|
||||
}
|
||||
|
||||
def readToBuffer(fis: FileInputStream,
|
||||
currentOffset: Long) = {
|
||||
val buffer =
|
||||
if (nextChunkSize(currentOffset) < maxBufferSize)
|
||||
new Array[Byte](nextChunkSize(currentOffset))
|
||||
else
|
||||
defaultBuffer
|
||||
fis read buffer
|
||||
buffer
|
||||
}
|
||||
|
||||
def readFile: IO[String] = openFile
|
||||
.bracket(fis => IO {
|
||||
val md5 = MessageDigest getInstance "MD5"
|
||||
NumericRange(0, file.length, maxBufferSize)
|
||||
.foreach{currentOffset => {
|
||||
val buffer = readToBuffer(fis, currentOffset)
|
||||
md5 update buffer
|
||||
}
|
||||
}
|
||||
(md5.digest map ("%02x" format _)).mkString
|
||||
})(closeFile)
|
||||
|
||||
for {
|
||||
_ <- info(5)(s"md5:reading:offset $offset:size $size:$file")
|
||||
_ <- readFile
|
||||
hash = md5PartBody(buffer)
|
||||
_ <- info(5)(s"md5:reading:size ${file.length}:$file")
|
||||
md5 <- readFile
|
||||
hash = MD5Hash(md5)
|
||||
_ <- info(4)(s"md5:generated:${hash.hash}:$file")
|
||||
} yield hash
|
||||
}
|
||||
|
||||
def md5PartBody(partBody: Array[Byte]): MD5Hash = {
|
||||
val md5 = MessageDigest getInstance "MD5"
|
||||
md5 update partBody
|
||||
MD5Hash((md5.digest map ("%02x" format _)).mkString)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -21,14 +21,6 @@ class MD5HashGeneratorTest extends FunSpec {
|
|||
assertResult(rootHash)(result)
|
||||
}
|
||||
}
|
||||
describe("read a buffer") {
|
||||
val file = Resource(this, "upload/root-file")
|
||||
val buffer: Array[Byte] = Files.readAllBytes(file.toPath)
|
||||
it("should generate the correct hash") {
|
||||
val result = MD5HashGenerator.md5PartBody(buffer)
|
||||
assertResult(rootHash)(result)
|
||||
}
|
||||
}
|
||||
describe("read a large file (bigger than buffer)") {
|
||||
val file = Resource(this, "big-file")
|
||||
it("should generate the correct hash") {
|
||||
|
@ -37,24 +29,5 @@ class MD5HashGeneratorTest extends FunSpec {
|
|||
assertResult(expected)(result)
|
||||
}
|
||||
}
|
||||
describe("read part of a file") {
|
||||
val file = Resource(this, "big-file")
|
||||
val halfFileLength = file.length / 2
|
||||
assertResult(file.length)(halfFileLength * 2)
|
||||
describe("when starting at the beginning of the file") {
|
||||
it("should generate the correct hash") {
|
||||
val expected = MD5Hash("aadf0d266cefe0fcdb241a51798d74b3")
|
||||
val result = MD5HashGenerator.md5FilePart(file, 0, halfFileLength).unsafeRunSync
|
||||
assertResult(expected)(result)
|
||||
}
|
||||
}
|
||||
describe("when starting in the middle of the file") {
|
||||
it("should generate the correct hash") {
|
||||
val expected = MD5Hash("16e08d53ca36e729d808fd5e4f7e35dc")
|
||||
val result = MD5HashGenerator.md5FilePart(file, halfFileLength, halfFileLength).unsafeRunSync
|
||||
assertResult(expected)(result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,11 +2,15 @@ package net.kemitix.s3thorp.domain
|
|||
|
||||
object SizeTranslation {
|
||||
|
||||
val kbLimit = 10240L
|
||||
val mbLimit = kbLimit * 1024
|
||||
val gbLimit = mbLimit * 1024
|
||||
|
||||
def sizeInEnglish(length: Long): String =
|
||||
length match {
|
||||
case bytes if bytes > 1024 * 1024 * 1024 => s"${bytes / 1024 / 1024 /1024}Gb"
|
||||
case bytes if bytes > 1024 * 1024 => s"${bytes / 1024 / 1024}Mb"
|
||||
case bytes if bytes > 1024 => s"${bytes / 1024}Kb"
|
||||
length.toDouble match {
|
||||
case bytes if bytes > gbLimit => f"${bytes / 1024 / 1024 /1024}%.3fGb"
|
||||
case bytes if bytes > mbLimit => f"${bytes / 1024 / 1024}%.2fMb"
|
||||
case bytes if bytes > kbLimit => f"${bytes / 1024}%.0fKb"
|
||||
case bytes => s"${length}b"
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
package net.kemitix.s3thorp.domain
|
||||
|
||||
import org.scalatest.FunSpec
|
||||
|
||||
class SizeTranslationTest extends FunSpec {
|
||||
|
||||
describe("sizeInEnglish") {
|
||||
describe("when size is less the 1Kb") {
|
||||
it("should in in bytes") {
|
||||
assertResult("512b")(SizeTranslation.sizeInEnglish(512))
|
||||
}
|
||||
}
|
||||
describe("when size is a less than 10Kb") {
|
||||
it("should still be in bytes") {
|
||||
assertResult("2000b")(SizeTranslation.sizeInEnglish(2000))
|
||||
}
|
||||
}
|
||||
describe("when size is over 10Kb and less than 10Mb") {
|
||||
it("should be in Kb with zero decimal places") {
|
||||
assertResult("5468Kb")(SizeTranslation.sizeInEnglish(5599232))
|
||||
}
|
||||
}
|
||||
describe("when size is over 10Mb and less than 10Gb") {
|
||||
it("should be in Mb with two decimal place") {
|
||||
assertResult("5468.17Mb")(SizeTranslation.sizeInEnglish(5733789833L))
|
||||
}
|
||||
}
|
||||
describe("when size is over 10Gb") {
|
||||
it("should be in Gb with three decimal place") {
|
||||
assertResult("5468.168Gb")(SizeTranslation.sizeInEnglish(5871400857278L))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in a new issue