From eb541bc7d487540835260164c1ebeff0060d5ff7 Mon Sep 17 00:00:00 2001 From: Monica Sarbu Date: Thu, 17 Dec 2015 21:06:39 +0100 Subject: [PATCH] Add exclude_files configuration option You can define a list of regular expressions to match the ignored files. This option is useful in case you want to include all the log files with the exception of the .gz files. For this particular case, you can use exclude_files: ['.gz$']. --- CHANGELOG.asciidoc | 1 + filebeat/config/config.go | 3 ++ filebeat/crawler/prospector.go | 25 ++++++++++++++ filebeat/crawler/prospector_test.go | 20 +++++++++++ filebeat/docs/configuration.asciidoc | 10 ++++++ filebeat/etc/beat.yml | 4 +++ filebeat/etc/filebeat.yml | 4 +++ filebeat/harvester/log.go | 30 ---------------- filebeat/harvester/util.go | 33 +++++++++++++++++- filebeat/harvester/util_test.go | 19 +++++++++++ filebeat/tests/system/config/filebeat.yml.j2 | 5 ++- filebeat/tests/system/test_prospector.py | 36 +++++++++++++++++++- 12 files changed, 157 insertions(+), 33 deletions(-) create mode 100644 filebeat/harvester/util_test.go diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index d6875cc5c2b..6b4ef7f0305 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -27,6 +27,7 @@ https://github.com/elastic/beats/compare/1.0.0...master[Check the HEAD diff] *Topbeat* *Filebeat* +- Add exclude_files configuration option {pull}563[563] *Winlogbeat* diff --git a/filebeat/config/config.go b/filebeat/config/config.go index 5174d8f793e..db45ea305b2 100644 --- a/filebeat/config/config.go +++ b/filebeat/config/config.go @@ -4,6 +4,7 @@ import ( "log" "os" "path/filepath" + "regexp" "time" "github.com/elastic/beats/libbeat/cfgfile" @@ -49,6 +50,8 @@ type ProspectorConfig struct { ScanFrequency string `yaml:"scan_frequency"` ScanFrequencyDuration time.Duration Harvester HarvesterConfig `yaml:",inline"` + ExcludeFiles []string `yaml:"exclude_files"` + ExcludeFilesRegexp []*regexp.Regexp } type HarvesterConfig struct { diff --git a/filebeat/crawler/prospector.go b/filebeat/crawler/prospector.go index b92eae18995..dfadc0d3f43 100644 --- a/filebeat/crawler/prospector.go +++ b/filebeat/crawler/prospector.go @@ -52,6 +52,10 @@ func (p *Prospector) setupProspectorConfig() error { if err != nil { return err } + config.ExcludeFilesRegexp, err = harvester.InitRegexps(config.ExcludeFiles) + if err != nil { + return err + } // Init File Stat list p.prospectorList = make(map[string]harvester.FileStat) @@ -219,11 +223,26 @@ func (p *Prospector) stdinRun(spoolChan chan *input.FileEvent) { } } +func (p *Prospector) isFileExcluded(file string) bool { + + config := &p.ProspectorConfig + + if len(config.ExcludeFilesRegexp) > 0 { + + if harvester.MatchAnyRegexps(config.ExcludeFilesRegexp, file) { + return true + } + } + + return false +} + // Scans the specific path which can be a glob (/**/**/*.log) // For all found files it is checked if a harvester should be started func (p *Prospector) scan(path string, output chan *input.FileEvent) { logp.Debug("prospector", "scan path %s", path) + logp.Debug("prospector", "exclude_files: %s", p.ProspectorConfig.ExcludeFiles) // Evaluate the path as a wildcards/shell glob matches, err := filepath.Glob(path) if err != nil { @@ -237,6 +256,12 @@ func (p *Prospector) scan(path string, output chan *input.FileEvent) { for _, file := range matches { logp.Debug("prospector", "Check file for harvesting: %s", file) + // check if the file is in the exclude_files list + if p.isFileExcluded(file) { + logp.Debug("prospector", "Exclude file: %s", file) + 
continue + } + // Stat the file, following any symlinks. fileinfo, err := os.Stat(file) diff --git a/filebeat/crawler/prospector_test.go b/filebeat/crawler/prospector_test.go index e186908b5e0..8ffe9a733f7 100644 --- a/filebeat/crawler/prospector_test.go +++ b/filebeat/crawler/prospector_test.go @@ -171,3 +171,23 @@ func TestProspectorInitInputTypeWrong(t *testing.T) { assert.Nil(t, err) assert.Equal(t, "log", prospector.ProspectorConfig.Harvester.InputType) } + +func TestProspectorFileExclude(t *testing.T) { + + prospectorConfig := config.ProspectorConfig{ + ExcludeFiles: []string{"\\.gz$"}, + Harvester: config.HarvesterConfig{ + BufferSize: 0, + }, + } + + prospector := Prospector{ + ProspectorConfig: prospectorConfig, + } + + prospector.Init() + + assert.True(t, prospector.isFileExcluded("/tmp/log/logw.gz")) + assert.False(t, prospector.isFileExcluded("/tmp/log/logw.log")) + +} diff --git a/filebeat/docs/configuration.asciidoc b/filebeat/docs/configuration.asciidoc index 6ee7c7b1156..521e1debe25 100644 --- a/filebeat/docs/configuration.asciidoc +++ b/filebeat/docs/configuration.asciidoc @@ -91,6 +91,16 @@ If both `include_lines` and `exclude_lines` are defined, then include_lines is c exclude_lines: ["^DBG"] ------------------------------------------------------------------------------------- +===== exclude_files + +A list of regular expressions to match the files to be ignored. By default no file is excluded. + +[source,yaml] +------------------------------------------------------------------------------------- + exclude_files: [".gz$"] +------------------------------------------------------------------------------------- +To ignore all the files with the `gz` extension. + [[configuration-fields]] ===== fields diff --git a/filebeat/etc/beat.yml b/filebeat/etc/beat.yml index de8de4cedc6..64821d7da29 100644 --- a/filebeat/etc/beat.yml +++ b/filebeat/etc/beat.yml @@ -40,6 +40,10 @@ filebeat: # exclude_lines. By default, all the lines are exported. # include_lines: ["^ERR", "^WARN"] + # Exclude files. A list of regular expressions to match. Filebeat drops the files that + # are matching any regular expression from the list. By default, no files are dropped. + # exclude_files: [".gz$"] + # Optional additional fields. These field can be freely picked # to add additional information to the crawled log files for filtering #fields: diff --git a/filebeat/etc/filebeat.yml b/filebeat/etc/filebeat.yml index 8139278c161..34c3dc5e824 100644 --- a/filebeat/etc/filebeat.yml +++ b/filebeat/etc/filebeat.yml @@ -40,6 +40,10 @@ filebeat: # exclude_lines. By default, all the lines are exported. # include_lines: ["^ERR", "^WARN"] + # Exclude files. A list of regular expressions to match. Filebeat drops the files that + # are matching any regular expression from the list. By default, no files are dropped. + # exclude_files: [".gz$"] + # Optional additional fields. 
These field can be freely picked # to add additional information to the crawled log files for filtering #fields: diff --git a/filebeat/harvester/log.go b/filebeat/harvester/log.go index ab1699eddca..e7efa22a3e4 100644 --- a/filebeat/harvester/log.go +++ b/filebeat/harvester/log.go @@ -5,7 +5,6 @@ import ( "fmt" "io" "os" - "regexp" "time" "github.com/elastic/beats/filebeat/config" @@ -332,35 +331,6 @@ func (h *Harvester) handleReadlineError(lastTimeRead time.Time, err error) error func (h *Harvester) Stop() { } -func InitRegexps(exprs []string) ([]*regexp.Regexp, error) { - - result := []*regexp.Regexp{} - - for _, exp := range exprs { - - rexp, err := regexp.CompilePOSIX(exp) - if err != nil { - logp.Err("Fail to compile the regexp %s: %s", exp, err) - return nil, err - } - result = append(result, rexp) - } - return result, nil -} - -func MatchAnyRegexps(regexps []*regexp.Regexp, text string) bool { - - for _, rexp := range regexps { - if rexp.MatchString(text) { - // drop line - return true - - } - } - - return false -} - const maxConsecutiveEmptyReads = 100 // timedReader keeps track of last time bytes have been read from underlying diff --git a/filebeat/harvester/util.go b/filebeat/harvester/util.go index 54b0c871588..2bc9d137f48 100644 --- a/filebeat/harvester/util.go +++ b/filebeat/harvester/util.go @@ -1,9 +1,11 @@ package harvester import ( + "regexp" + "time" + "github.com/elastic/beats/filebeat/harvester/encoding" "github.com/elastic/beats/libbeat/logp" - "time" ) // isLine checks if the given byte array is a line, means has a line ending \n @@ -60,3 +62,32 @@ func readlineString(bytes []byte, size int) (string, int, error) { s := string(bytes)[:len(bytes)-lineEndingChars(bytes)] return s, size, nil } + +// InitRegexps initializes a list of compiled regular expressions. 
+func InitRegexps(exprs []string) ([]*regexp.Regexp, error) { + + result := []*regexp.Regexp{} + + for _, exp := range exprs { + + rexp, err := regexp.CompilePOSIX(exp) + if err != nil { + logp.Err("Fail to compile the regexp %s: %s", exp, err) + return nil, err + } + result = append(result, rexp) + } + return result, nil +} + +// MatchAnyRegexps checks if the text matches any of the regular expressions +func MatchAnyRegexps(regexps []*regexp.Regexp, text string) bool { + + for _, rexp := range regexps { + if rexp.MatchString(text) { + return true + } + } + + return false +} diff --git a/filebeat/harvester/util_test.go b/filebeat/harvester/util_test.go new file mode 100644 index 00000000000..00364531db9 --- /dev/null +++ b/filebeat/harvester/util_test.go @@ -0,0 +1,19 @@ +package harvester + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMatchAnyRegexps(t *testing.T) { + + patterns := []string{"\\.gz$"} + + regexps, err := InitRegexps(patterns) + + assert.Nil(t, err) + + assert.Equal(t, MatchAnyRegexps(regexps, "/var/log/log.gz"), true) + +} diff --git a/filebeat/tests/system/config/filebeat.yml.j2 b/filebeat/tests/system/config/filebeat.yml.j2 index f1df92b6b3f..69ea994aa65 100644 --- a/filebeat/tests/system/config/filebeat.yml.j2 +++ b/filebeat/tests/system/config/filebeat.yml.j2 @@ -29,7 +29,10 @@ filebeat: {% if exclude_lines %} exclude_lines: {{exclude_lines}} {% endif %} - + {% if exclude_files %} + exclude_files: {{exclude_files}} + {% endif %} + spool_size: idle_timeout: 0.1s registry_file: {{ fb.working_dir + '/' }}{{ registryFile|default(".filebeat")}} diff --git a/filebeat/tests/system/test_prospector.py b/filebeat/tests/system/test_prospector.py index dbdbfe6305e..efd862b649d 100644 --- a/filebeat/tests/system/test_prospector.py +++ b/filebeat/tests/system/test_prospector.py @@ -105,7 +105,6 @@ def test_stdin(self): lambda: self.output_has(lines=iterations1+iterations2), max_timeout=15) - proc.kill_and_wait() objs = self.read_output() @@ -140,3 +139,38 @@ def test_rotating_ignore_older_larger_write_rate(self): max_timeout=15) proc.kill_and_wait() + + def test_exclude_files(self): + + self.render_config_template( + path=os.path.abspath(self.working_dir) + "/log/*", + exclude_files=[".gz$"] + ) + os.mkdir(self.working_dir + "/log/") + + testfile = self.working_dir + "/log/test.gz" + file = open(testfile, 'w') + file.write("line in gz file\n") + file.close() + + testfile = self.working_dir + "/log/test.log" + file = open(testfile, 'w') + file.write("line in log file\n") + file.close() + + filebeat = self.start_filebeat() + + self.wait_until( + lambda: self.output_has(lines=1), + max_timeout=15) + + # TODO: Find better solution when filebeat did crawl the file + # Idea: Special flag to filebeat so that filebeat is only doing and + # crawl and then finishes + filebeat.kill_and_wait() + + output = self.read_output() + + # Check that output file has the same number of lines as the log file + assert 1 == len(output) + assert output[0]["message"] == "line in log file"
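
For reference, here is a minimal standalone sketch of the exclusion logic this patch adds in `filebeat/harvester/util.go`: the patterns from `exclude_files` are compiled with `regexp.CompilePOSIX` (see `InitRegexps`), and a file is skipped as soon as any compiled expression matches its path (see `MatchAnyRegexps` and `Prospector.isFileExcluded`). The helper names `initRegexps`/`matchAny`, the sample paths, and the `main` wrapper below are illustrative only and are not part of the patch.

[source,go]
-------------------------------------------------------------------------------------
package main

import (
	"fmt"
	"regexp"
)

// initRegexps mirrors harvester.InitRegexps from the patch: each pattern is
// compiled with regexp.CompilePOSIX and the first invalid expression aborts.
func initRegexps(exprs []string) ([]*regexp.Regexp, error) {
	result := make([]*regexp.Regexp, 0, len(exprs))
	for _, exp := range exprs {
		rexp, err := regexp.CompilePOSIX(exp)
		if err != nil {
			return nil, fmt.Errorf("failed to compile the regexp %s: %v", exp, err)
		}
		result = append(result, rexp)
	}
	return result, nil
}

// matchAny mirrors harvester.MatchAnyRegexps: a path is considered excluded
// as soon as any of the compiled expressions matches it.
func matchAny(regexps []*regexp.Regexp, text string) bool {
	for _, rexp := range regexps {
		if rexp.MatchString(text) {
			return true
		}
	}
	return false
}

func main() {
	// Same pattern as the documentation example: drop gzipped logs.
	excludeFiles := []string{`\.gz$`}

	regexps, err := initRegexps(excludeFiles)
	if err != nil {
		panic(err)
	}

	// Hypothetical paths, used only to show which files would be skipped.
	for _, file := range []string{"/var/log/app.log", "/var/log/app.log.gz"} {
		fmt.Printf("%s excluded: %v\n", file, matchAny(regexps, file))
	}
}
-------------------------------------------------------------------------------------

Note that the Go test uses the escaped form `\.gz$`, which matches only a literal `.gz` suffix, while the documentation example `.gz$` also matches names such as `foo.tgz`, because the unescaped `.` matches any character; both work for the common case, the escaped form is simply stricter.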