diff --git a/CHANGELOG.org b/CHANGELOG.org index 71fbc4b..2fc4692 100644 --- a/CHANGELOG.org +++ b/CHANGELOG.org @@ -6,6 +6,17 @@ The format is based on [[https://keepachangelog.com/en/1.0.0/][Keep a Changelog] [[https://semver.org/spec/v2.0.0.html][Semantic Versioning]]. +* [1.0.0] - ??? + +** Added + + - Create and use a cache of hashes for local files (#249) + +** Dependencies + + - Revert "[sbt] Rollback ~sbt-ci-release~ to ~1.3.2~ (#231)" + - Update ~sbt~ to ~1.3.3~ (#238) + * [0.10.0] - 2019-10-08 This is the last ~v0.x~ feature release. The next feature release will be ~v1.x~. diff --git a/README.org b/README.org index fc0b1d7..eca299f 100644 --- a/README.org +++ b/README.org @@ -64,6 +64,10 @@ that can be written to a file. Note, that ~include~ and ~exclude~ are cumulative across all configuration files. +* Caching + +The last modified time for files is used to decide whether to calculate the hash values for the file. If a file has not been updated, then the hash values stored in the `.thorp.cache` file located in the root of the source is used. Otherwise the file will be read to caculate the the new hashes. + * Behaviour When considering a local file, the following table governs what should happen: diff --git a/domain/src/main/scala/net/kemitix/thorp/domain/LocalFile.scala b/domain/src/main/scala/net/kemitix/thorp/domain/LocalFile.scala index fbb9bb8..0b06658 100644 --- a/domain/src/main/scala/net/kemitix/thorp/domain/LocalFile.scala +++ b/domain/src/main/scala/net/kemitix/thorp/domain/LocalFile.scala @@ -8,7 +8,7 @@ import net.kemitix.thorp.domain.Implicits._ final case class LocalFile private ( file: File, source: File, - hashes: Map[HashType, MD5Hash], + hashes: Hashes, remoteKey: RemoteKey, length: Long ) diff --git a/domain/src/main/scala/net/kemitix/thorp/domain/RemoteObjects.scala b/domain/src/main/scala/net/kemitix/thorp/domain/RemoteObjects.scala index b8b590c..76dd6e4 100644 --- a/domain/src/main/scala/net/kemitix/thorp/domain/RemoteObjects.scala +++ b/domain/src/main/scala/net/kemitix/thorp/domain/RemoteObjects.scala @@ -36,7 +36,7 @@ object RemoteObjects { def remoteHasHash( remoteObjects: RemoteObjects, - hashes: Map[HashType, MD5Hash] + hashes: Hashes ): UIO[Option[(RemoteKey, MD5Hash)]] = UIO(remoteObjects.byHash.collectFirst { case (hash, key) if (hashes.values.exists(h => h == hash)) => (key, hash) diff --git a/domain/src/main/scala/net/kemitix/thorp/domain/package.scala b/domain/src/main/scala/net/kemitix/thorp/domain/package.scala new file mode 100644 index 0000000..054b386 --- /dev/null +++ b/domain/src/main/scala/net/kemitix/thorp/domain/package.scala @@ -0,0 +1,8 @@ +package net.kemitix.thorp + +import java.time.Instant + +package object domain { + type Hashes = Map[HashType, MD5Hash] + type LastModified = Instant +} diff --git a/filesystem/src/main/scala/net/kemitix/thorp/filesystem/FileData.scala b/filesystem/src/main/scala/net/kemitix/thorp/filesystem/FileData.scala new file mode 100644 index 0000000..173f0e5 --- /dev/null +++ b/filesystem/src/main/scala/net/kemitix/thorp/filesystem/FileData.scala @@ -0,0 +1,22 @@ +package net.kemitix.thorp.filesystem + +import net.kemitix.thorp.domain.{Hashes, LastModified} + +case class FileData( + hashes: Hashes, + lastModified: LastModified +) { + def +(other: FileData): FileData = { + FileData( + hashes = this.hashes ++ other.hashes, + lastModified = lastModified // discards other.lastModified + ) + } +} + +object FileData { + def create(hashes: Hashes, lastModified: LastModified): FileData = FileData( + hashes = hashes, + lastModified = lastModified + ) +} diff --git a/filesystem/src/main/scala/net/kemitix/thorp/filesystem/FileSystem.scala b/filesystem/src/main/scala/net/kemitix/thorp/filesystem/FileSystem.scala index b13209c..9f783b0 100644 --- a/filesystem/src/main/scala/net/kemitix/thorp/filesystem/FileSystem.scala +++ b/filesystem/src/main/scala/net/kemitix/thorp/filesystem/FileSystem.scala @@ -1,10 +1,11 @@ package net.kemitix.thorp.filesystem -import java.io.{File, FileInputStream} -import java.nio.file.{Files, Path} +import java.io.{File, FileInputStream, FileWriter} +import java.nio.file.{Files, Path, StandardCopyOption} +import java.time.Instant import java.util.stream -import net.kemitix.thorp.domain.{RemoteKey, Sources} +import net.kemitix.thorp.domain.{Hashes, RemoteKey, Sources} import zio._ import scala.jdk.CollectionConverters._ @@ -19,12 +20,19 @@ object FileSystem { def openManagedFileInputStream(file: File, offset: Long) : RIO[FileSystem, ZManaged[Any, Throwable, FileInputStream]] def fileLines(file: File): RIO[FileSystem, Seq[String]] + def appendLines(lines: Iterable[String], file: File): UIO[Unit] def isDirectory(file: File): RIO[FileSystem, Boolean] def listFiles(path: Path): UIO[List[File]] + def listDirs(path: Path): UIO[List[Path]] def length(file: File): ZIO[FileSystem, Nothing, Long] + def lastModified(file: File): UIO[Instant] def hasLocalFile(sources: Sources, prefix: RemoteKey, remoteKey: RemoteKey): ZIO[FileSystem, Nothing, Boolean] + def findCache( + directory: Path): ZIO[FileSystem with Hasher, Nothing, PathCache] + def getHashes(path: Path, fileData: FileData): ZIO[FileSystem, Any, Hashes] + def moveFile(source: Path, target: Path): UIO[Unit] } trait Live extends FileSystem { override val filesystem: Service = new Service { @@ -59,12 +67,28 @@ object FileSystem { Task(file.isDirectory) override def listFiles(path: Path): UIO[List[File]] = - Task(List.from(path.toFile.listFiles())) - .catchAll(_ => UIO.succeed(List.empty[File])) + Task { + List + .from(path.toFile.listFiles()) + .filterNot(_.isDirectory) + .filterNot(_.getName.contentEquals(PathCache.fileName)) + .filterNot(_.getName.contentEquals(PathCache.tempFileName)) + }.catchAll(_ => UIO.succeed(List.empty[File])) + + override def listDirs(path: Path): UIO[List[Path]] = + Task( + List + .from(path.toFile.listFiles()) + .filter(_.isDirectory) + .map(_.toPath)) + .catchAll(_ => UIO.succeed(List.empty[Path])) override def length(file: File): ZIO[FileSystem, Nothing, Long] = UIO(file.length) + override def lastModified(file: File): UIO[Instant] = + UIO(Instant.ofEpochMilli(file.lastModified())) + override def hasLocalFile( sources: Sources, prefix: RemoteKey, @@ -77,6 +101,40 @@ object FileSystem { .map(_ || accExists) } } + + override def findCache( + directory: Path): ZIO[FileSystem with Hasher, Nothing, PathCache] = + for { + cacheFile <- UIO(directory.resolve(PathCache.fileName).toFile) + lines <- fileLines(cacheFile).catchAll(_ => UIO(List.empty)) + cache <- PathCache.fromLines(lines) + } yield cache + + override def getHashes( + path: Path, + fileData: FileData): ZIO[FileSystem, Any, Hashes] = { + val lastModified = Instant.ofEpochMilli(path.toFile.lastModified()) + if (lastModified.isAfter(fileData.lastModified)) { + ZIO.fail("fileData is out-of-date") + } else { + ZIO.succeed(fileData.hashes) + } + } + + override def appendLines(lines: Iterable[String], file: File): UIO[Unit] = + UIO.bracket(UIO(new FileWriter(file, true)))(fw => UIO(fw.close()))( + fw => + UIO { + lines.map(line => fw.append(line + System.lineSeparator())) + }) + + override def moveFile(source: Path, target: Path): UIO[Unit] = + IO { + if (source.toFile.exists()) { + Files.move(source, target, StandardCopyOption.ATOMIC_MOVE) + } + () + }.catchAll(_ => UIO.unit) } } object Live extends Live @@ -86,9 +144,13 @@ object FileSystem { val fileLinesResult: Task[List[String]] val isDirResult: Task[Boolean] val listFilesResult: UIO[List[File]] + val listDirsResult: UIO[List[Path]] val lengthResult: UIO[Long] + val lastModifiedResult: UIO[Instant] val managedFileInputStream: Task[ZManaged[Any, Throwable, FileInputStream]] val hasLocalFileResult: UIO[Boolean] + val pathCacheResult: UIO[PathCache] + val matchesResult: IO[Any, Hashes] override val filesystem: Service = new Service { @@ -108,14 +170,33 @@ object FileSystem { override def listFiles(path: Path): UIO[List[File]] = listFilesResult + override def listDirs(path: Path): UIO[List[Path]] = + listDirsResult + override def length(file: File): UIO[Long] = lengthResult + override def lastModified(file: File): UIO[Instant] = + lastModifiedResult + override def hasLocalFile( sources: Sources, prefix: RemoteKey, remoteKey: RemoteKey): ZIO[FileSystem, Nothing, Boolean] = hasLocalFileResult + + override def findCache(directory: Path): UIO[PathCache] = + pathCacheResult + + override def getHashes(path: Path, + fileData: FileData): ZIO[FileSystem, Any, Hashes] = + matchesResult + + override def appendLines(lines: Iterable[String], file: File): UIO[Unit] = + UIO.unit + + override def moveFile(source: Path, target: Path): UIO[Unit] = + UIO.unit } } @@ -136,9 +217,18 @@ object FileSystem { final def isDirectory(file: File): RIO[FileSystem, Boolean] = ZIO.accessM(_.filesystem.isDirectory(file)) + /** + * Lists only files within the Path. + */ final def listFiles(path: Path): ZIO[FileSystem, Nothing, List[File]] = ZIO.accessM(_.filesystem.listFiles(path)) + /** + * Lists only sub-directories within the Path. + */ + final def listDirs(path: Path): ZIO[FileSystem, Nothing, List[Path]] = + ZIO.accessM(_.filesystem.listDirs(path)) + final def length(file: File): ZIO[FileSystem, Nothing, Long] = ZIO.accessM(_.filesystem.length(file)) @@ -147,4 +237,26 @@ object FileSystem { prefix: RemoteKey, remoteKey: RemoteKey): ZIO[FileSystem, Nothing, Boolean] = ZIO.accessM(_.filesystem.hasLocalFile(sources, prefix, remoteKey)) + + final def findCache( + directory: Path): ZIO[FileSystem with Hasher, Nothing, PathCache] = + ZIO.accessM(_.filesystem.findCache(directory)) + + final def getHashes(path: Path, + fileData: FileData): ZIO[FileSystem, Any, Hashes] = + ZIO.accessM(_.filesystem.getHashes(path, fileData)) + + final def lastModified(file: File): ZIO[FileSystem, Nothing, Instant] = + ZIO.accessM(_.filesystem.lastModified(file)) + + final def appendLines(lines: Iterable[String], + file: File): ZIO[FileSystem, Nothing, Unit] = + ZIO.accessM(_.filesystem.appendLines(lines, file)) + + final def moveFile( + source: Path, + target: Path + ): ZIO[FileSystem, Nothing, Unit] = + ZIO.accessM(_.filesystem.moveFile(source, target)) + } diff --git a/filesystem/src/main/scala/net/kemitix/thorp/filesystem/Hasher.scala b/filesystem/src/main/scala/net/kemitix/thorp/filesystem/Hasher.scala index cdb0d3e..6d2f575 100644 --- a/filesystem/src/main/scala/net/kemitix/thorp/filesystem/Hasher.scala +++ b/filesystem/src/main/scala/net/kemitix/thorp/filesystem/Hasher.scala @@ -4,7 +4,7 @@ import java.nio.file.Path import java.util.concurrent.atomic.AtomicReference import net.kemitix.thorp.domain.HashType.MD5 -import net.kemitix.thorp.domain.{HashType, MD5Hash} +import net.kemitix.thorp.domain.{HashType, Hashes} import zio.{RIO, ZIO} /** @@ -15,27 +15,33 @@ trait Hasher { } object Hasher { trait Service { + def typeFrom(str: String): ZIO[Hasher, IllegalArgumentException, HashType] + def hashObject( - path: Path): RIO[Hasher with FileSystem, Map[HashType, MD5Hash]] - def hashObjectChunk( path: Path, - chunkNumber: Long, - chunkSize: Long): RIO[Hasher with FileSystem, Map[HashType, MD5Hash]] + cachedFileData: Option[FileData]): RIO[Hasher with FileSystem, Hashes] + def hashObjectChunk(path: Path, + chunkNumber: Long, + chunkSize: Long): RIO[Hasher with FileSystem, Hashes] def hex(in: Array[Byte]): RIO[Hasher, String] def digest(in: String): RIO[Hasher, Array[Byte]] } trait Live extends Hasher { val hasher: Service = new Service { override def hashObject( - path: Path): RIO[FileSystem, Map[HashType, MD5Hash]] = - for { - md5 <- MD5HashGenerator.md5File(path) - } yield Map(MD5 -> md5) + path: Path, + cachedFileData: Option[FileData]): RIO[FileSystem, Hashes] = + ZIO + .fromOption(cachedFileData) + .flatMap(fileData => FileSystem.getHashes(path, fileData)) + .orElse(for { + md5 <- MD5HashGenerator.md5File(path) + } yield Map(MD5 -> md5)) - override def hashObjectChunk(path: Path, - chunkNumber: Long, - chunkSize: Long) - : RIO[Hasher with FileSystem, Map[HashType, MD5Hash]] = + override def hashObjectChunk( + path: Path, + chunkNumber: Long, + chunkSize: Long): RIO[Hasher with FileSystem, Hashes] = for { md5 <- MD5HashGenerator.md5FileChunk(path, chunkNumber * chunkSize, @@ -47,25 +53,33 @@ object Hasher { override def digest(in: String): RIO[Hasher, Array[Byte]] = ZIO(MD5HashGenerator.digest(in)) + + override def typeFrom( + str: String): ZIO[Hasher, IllegalArgumentException, HashType] = + if (str.contentEquals("MD5")) { + ZIO.succeed(MD5) + } else { + ZIO.fail( + new IllegalArgumentException("Unknown Hash Type: %s".format(str))) + } } } object Live extends Live trait Test extends Hasher { - val hashes: AtomicReference[Map[Path, Map[HashType, MD5Hash]]] = + val hashes: AtomicReference[Map[Path, Hashes]] = new AtomicReference(Map.empty) - val hashChunks - : AtomicReference[Map[Path, Map[Long, Map[HashType, MD5Hash]]]] = + val hashChunks: AtomicReference[Map[Path, Map[Long, Hashes]]] = new AtomicReference(Map.empty) val hasher: Service = new Service { - override def hashObject( - path: Path): RIO[Hasher with FileSystem, Map[HashType, MD5Hash]] = + override def hashObject(path: Path, cachedFileData: Option[FileData]) + : RIO[Hasher with FileSystem, Hashes] = ZIO(hashes.get()(path)) - override def hashObjectChunk(path: Path, - chunkNumber: Long, - chunkSize: Long) - : RIO[Hasher with FileSystem, Map[HashType, MD5Hash]] = + override def hashObjectChunk( + path: Path, + chunkNumber: Long, + chunkSize: Long): RIO[Hasher with FileSystem, Hashes] = ZIO(hashChunks.get()(path)(chunkNumber)) override def hex(in: Array[Byte]): RIO[Hasher, String] = @@ -73,18 +87,23 @@ object Hasher { override def digest(in: String): RIO[Hasher, Array[Byte]] = ZIO(MD5HashGenerator.digest(in)) + + override def typeFrom( + str: String): ZIO[Hasher, IllegalArgumentException, HashType] = + Live.hasher.typeFrom(str) } } object Test extends Test final def hashObject( - path: Path): RIO[Hasher with FileSystem, Map[HashType, MD5Hash]] = - ZIO.accessM(_.hasher hashObject path) + path: Path, + cachedFileData: Option[FileData]): RIO[Hasher with FileSystem, Hashes] = + ZIO.accessM(_.hasher.hashObject(path, cachedFileData)) final def hashObjectChunk( path: Path, chunkNumber: Long, - chunkSize: Long): RIO[Hasher with FileSystem, Map[HashType, MD5Hash]] = + chunkSize: Long): RIO[Hasher with FileSystem, Hashes] = ZIO.accessM(_.hasher hashObjectChunk (path, chunkNumber, chunkSize)) final def hex(in: Array[Byte]): RIO[Hasher, String] = @@ -92,4 +111,9 @@ object Hasher { final def digest(in: String): RIO[Hasher, Array[Byte]] = ZIO.accessM(_.hasher digest in) + + final def typeFrom( + str: String): ZIO[Hasher, IllegalArgumentException, HashType] = + ZIO.accessM(_.hasher.typeFrom(str)) + } diff --git a/filesystem/src/main/scala/net/kemitix/thorp/filesystem/PathCache.scala b/filesystem/src/main/scala/net/kemitix/thorp/filesystem/PathCache.scala new file mode 100644 index 0000000..f28993f --- /dev/null +++ b/filesystem/src/main/scala/net/kemitix/thorp/filesystem/PathCache.scala @@ -0,0 +1,74 @@ +package net.kemitix.thorp.filesystem + +import java.nio.file.{Path, Paths} +import java.time.Instant +import java.util.regex.Pattern + +import net.kemitix.thorp.domain.{HashType, MD5Hash} +import zio.{UIO, ZIO} + +/** + * Meta data for files in the current source, as of the last time Thorp processed this directory. + * + *
N.B. Does not include sub-directories.
+ */ +final case class PathCache( + data: PathCache.Data +) { + def get(path: Path): Option[FileData] = data.get(path) +} + +object PathCache { + type Data = Map[Path, FileData] + val fileName = ".thorp.cache" + val tempFileName = ".thorp.cache.tmp" + + def create(path: Path, fileData: FileData): UIO[Iterable[String]] = + UIO { + fileData.hashes.keys.map(hashType => { + val hash = fileData.hashes(hashType) + val modified = fileData.lastModified + String.join(":", + hashType.toString, + hash.in, + modified.toEpochMilli.toString, + path.toString) + }) + } + + private val pattern = + "^(?