From b9bc7dc95760080a4374b15e85aded71c0a03cd3 Mon Sep 17 00:00:00 2001
From: Paul Campbell
Date: Tue, 28 May 2019 12:24:09 +0100
Subject: [PATCH] Add filter to select files to be synced (#24)

* [Filter] added

* [Config] Add filters field

* [ParseArgs] Add '-f'/'--filter' parameters

* [LocalFileStream] apply filters

* [SyncLogging] show filter(s)

* [LocalFileStream] Don't apply filter to directories

The filter may match on a file within a directory, but if the filter
fails on the directory alone, then we weren't recursing into the
directory at all.
---
 .../scala/net/kemitix/s3thorp/Config.scala    |  1 +
 .../scala/net/kemitix/s3thorp/Filter.scala    | 25 +++++++++++
 .../net/kemitix/s3thorp/LocalFileStream.scala |  4 +-
 .../scala/net/kemitix/s3thorp/ParseArgs.scala |  3 ++
 .../net/kemitix/s3thorp/SyncLogging.scala     |  1 +
 .../net/kemitix/s3thorp/FilterSuite.scala     | 44 +++++++++++++++++++
 6 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 src/main/scala/net/kemitix/s3thorp/Filter.scala
 create mode 100644 src/test/scala/net/kemitix/s3thorp/FilterSuite.scala

diff --git a/src/main/scala/net/kemitix/s3thorp/Config.scala b/src/main/scala/net/kemitix/s3thorp/Config.scala
index 2f61aa5..b86a096 100644
--- a/src/main/scala/net/kemitix/s3thorp/Config.scala
+++ b/src/main/scala/net/kemitix/s3thorp/Config.scala
@@ -5,6 +5,7 @@ import java.io.File
 case class Config(bucket: Bucket = Bucket(""),
                   prefix: RemoteKey = RemoteKey(""),
                   verbose: Int = 1,
+                  filters: Seq[Filter] = List(),
                   excludes: Seq[Exclude] = List(),
                   multiPartThreshold: Long = 1024 * 1024 * 5,
                   maxRetries: Int = 3,
diff --git a/src/main/scala/net/kemitix/s3thorp/Filter.scala b/src/main/scala/net/kemitix/s3thorp/Filter.scala
new file mode 100644
index 0000000..17ea823
--- /dev/null
+++ b/src/main/scala/net/kemitix/s3thorp/Filter.scala
@@ -0,0 +1,25 @@
+package net.kemitix.s3thorp
+
+import java.nio.file.Path
+import java.util.function.Predicate
+import java.util.regex.Pattern
+
+/**
+  * Selects the paths to be synced: a path is included only when the regular
+  * expression is found somewhere within it (a partial match is enough).
+  *
+  * @param filter the regular expression; the default, ".*", includes every path
+  */
+case class Filter(filter: String = ".*") {
+
+  /** Tests true for the paths this filter rejects, i.e. where the pattern is not found. */
+  lazy val predicate: Predicate[String] = Pattern.compile(filter).asPredicate.negate
+
+  /** True when the pattern is found in the string form of `path`. */
+  def isIncluded(path: Path): Boolean = !isExcluded(path)
+
+  /** True when the pattern is not found anywhere in the string form of `path`. */
+  def isExcluded(path: Path): Boolean = predicate.test(path.toString)
+
+}
+
diff --git a/src/main/scala/net/kemitix/s3thorp/LocalFileStream.scala b/src/main/scala/net/kemitix/s3thorp/LocalFileStream.scala
index 5aa4665..bc91a37 100644
--- a/src/main/scala/net/kemitix/s3thorp/LocalFileStream.scala
+++ b/src/main/scala/net/kemitix/s3thorp/LocalFileStream.scala
@@ -10,7 +10,9 @@ trait LocalFileStream
                   (implicit c: Config): Stream[LocalFile] = {
     log5(s"- Entering: $file")
     val files = for {
-      f <- dirPaths(file) filter { f => c.excludes.forall { filter => filter isIncluded f.toPath } }
+      f <- dirPaths(file)
+        .filter { f => f.isDirectory || c.filters.forall { filter => filter isIncluded f.toPath } }
+        .filter { f => c.excludes.forall { exclude => exclude isIncluded f.toPath } }
       fs <- recurseIntoSubDirectories(f)
     } yield fs
     log5(s"- Leaving: $file")
diff --git a/src/main/scala/net/kemitix/s3thorp/ParseArgs.scala b/src/main/scala/net/kemitix/s3thorp/ParseArgs.scala
index 052a8dd..755b99f 100644
--- a/src/main/scala/net/kemitix/s3thorp/ParseArgs.scala
+++ b/src/main/scala/net/kemitix/s3thorp/ParseArgs.scala
@@ -25,6 +25,9 @@ object ParseArgs {
       opt[String]('p', "prefix")
         .action((str, c) => c.copy(prefix = RemoteKey(str)))
         .text("Prefix within the S3 Bucket"),
+      opt[Seq[String]]('f', "filter")
+        .action((str, c) => c.copy(filters = str.map(Filter)))
+        .text("Filter only matching paths"),
       opt[Seq[String]]('x', "exclude")
         .action((str,c) => c.copy(excludes = str.map(Exclude)))
         .text("Exclude matching paths"),
diff --git a/src/main/scala/net/kemitix/s3thorp/SyncLogging.scala b/src/main/scala/net/kemitix/s3thorp/SyncLogging.scala
index b8758b4..2f4bc85 100644
--- a/src/main/scala/net/kemitix/s3thorp/SyncLogging.scala
+++ b/src/main/scala/net/kemitix/s3thorp/SyncLogging.scala
@@ -7,6 +7,7 @@ trait SyncLogging extends Logging {
 
   def logRunStart(implicit c: Config): Unit =
     log1(s"Bucket: ${c.bucket.name}, Prefix: ${c.prefix.key}, Source: ${c.source}, " +
+      s"Filter: ${c.filters.map{ f => f.filter}.mkString(""", """)}, " +
       s"Exclude: ${c.excludes.map{ f => f.exclude}.mkString(""", """)}")(c)
 
   def logFileScan(implicit c: Config): Unit =
diff --git a/src/test/scala/net/kemitix/s3thorp/FilterSuite.scala b/src/test/scala/net/kemitix/s3thorp/FilterSuite.scala
new file mode 100644
index 0000000..8ea491f
--- /dev/null
+++ b/src/test/scala/net/kemitix/s3thorp/FilterSuite.scala
@@ -0,0 +1,44 @@
+package net.kemitix.s3thorp
+
+import java.nio.file.{Path, Paths}
+
+class FilterSuite extends UnitTest {
+
+  describe("default filter") {
+    val filter = Filter()
+    val paths: List[Path] = List("/a-file", "a-file", "path/to/a/file", "/path/to/a/file",
+      "/home/pcampbell/repos/kemitix/s3thorp/target/scala-2.12/test-classes/net/kemitix/s3thorp/upload/subdir"
+    ) map { p => Paths.get(p)}
+    it("should not exclude files") {
+      paths.foreach(path => { assertResult(false)(filter.isExcluded(path)) })
+    }
+    it("should include files") {
+      paths.foreach(path => assertResult(true)(filter.isIncluded(path)))
+    }
+  }
+  describe("directory exact match include '/upload/subdir/'") {
+    val filter = Filter("/upload/subdir/")
+    it("include matching directory") {
+      val matching = Paths.get("/upload/subdir/leaf-file")
+      assertResult(true)(filter.isIncluded(matching))
+    }
+    it("exclude non-matching files") {
+      val nonMatching = Paths.get("/upload/other-file")
+      assertResult(true)(filter.isExcluded(nonMatching))
+    }
+  }
+  describe("file partial match 'root'") {
+    val filter = Filter("root")
+    it("include matching file '/upload/root-file") {
+      val matching = Paths.get("/upload/root-file")
+      assertResult(true)(filter.isIncluded(matching))
+    }
+    it("exclude non-matching files 'test-file-for-hash.txt' & '/upload/subdir/leaf-file'") {
+      val nonMatching1 = Paths.get("/test-file-for-hash.txt")
+      val nonMatching2 = Paths.get("/upload/subdir/leaf-file")
+      assertResult(true)(filter.isExcluded(nonMatching1))
+      assertResult(true)(filter.isExcluded(nonMatching2))
+    }
+  }
+
+}