summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTimofey Titovets <nefelim4ag@gmail.com>2017-09-28 16:33:40 +0200
committerDavid Sterba <dsterba@suse.com>2017-11-01 20:45:36 +0100
commita288e92cacdc4729ad8f83d018fb0f3f5ded6c37 (patch)
treed305a7ac88dfe4507782d0b7f92f244087c36fb5
parentBtrfs: heuristic: add detection of repeated data patterns (diff)
downloadlinux-a288e92cacdc4729ad8f83d018fb0f3f5ded6c37.tar.xz
linux-a288e92cacdc4729ad8f83d018fb0f3f5ded6c37.zip
Btrfs: heuristic: add byte set calculation
Calculate byte set size for data sample: - calculate how many unique bytes have been in the sample - for all bytes count > 0, check if we're still in the low count range (~25%), such data are easily compressible, otherwise furhter analysis is needed Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com> Reviewed-by: David Sterba <dsterba@suse.com> [ update comments ] Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/compression.c45
1 files changed, 45 insertions, 0 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0d445c815ca2..e949f078a81b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1222,6 +1222,45 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
return 1;
}
+/*
+ * Count byte values in buckets.
+ * This heuristic can detect textual data (configs, xml, json, html, etc).
+ * Because in most text-like data byte set is restricted to limited number of
+ * possible characters, and that restriction in most cases makes data easy to
+ * compress.
+ *
+ * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
+ * less - compressible
+ * more - need additional analysis
+ */
+#define BYTE_SET_THRESHOLD (64)
+
+static u32 byte_set_size(const struct heuristic_ws *ws)
+{
+ u32 i;
+ u32 byte_set_size = 0;
+
+ for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
+ if (ws->bucket[i].count > 0)
+ byte_set_size++;
+ }
+
+ /*
+ * Continue collecting count of byte values in buckets. If the byte
+ * set size is bigger then the threshold, it's pointless to continue,
+ * the detection technique would fail for this type of data.
+ */
+ for (; i < BUCKET_SIZE; i++) {
+ if (ws->bucket[i].count > 0) {
+ byte_set_size++;
+ if (byte_set_size > BYTE_SET_THRESHOLD)
+ return byte_set_size;
+ }
+ }
+
+ return byte_set_size;
+}
+
static bool sample_repeated_patterns(struct heuristic_ws *ws)
{
const u32 half_of_sample = ws->sample_size / 2;
@@ -1321,6 +1360,12 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
ws->bucket[byte].count++;
}
+ i = byte_set_size(ws);
+ if (i < BYTE_SET_THRESHOLD) {
+ ret = 2;
+ goto out;
+ }
+
out:
__free_workspace(0, ws_list, true);
return ret;