From eb541bc7d487540835260164c1ebeff0060d5ff7 Mon Sep 17 00:00:00 2001
From: Monica Sarbu
Date: Thu, 17 Dec 2015 21:06:39 +0100
Subject: [PATCH] Add exclude_files configuration option

You can define a list of regular expressions to match the files that should
be ignored. This option is useful when you want to include all the log files
except the .gz files. For this particular case, you can use
exclude_files: ['.gz$'].
---
 CHANGELOG.asciidoc                           |  1 +
 filebeat/config/config.go                    |  3 ++
 filebeat/crawler/prospector.go               | 25 ++++++++++++++
 filebeat/crawler/prospector_test.go          | 20 +++++++++++
 filebeat/docs/configuration.asciidoc         | 10 ++++++
 filebeat/etc/beat.yml                        |  4 +++
 filebeat/etc/filebeat.yml                    |  4 +++
 filebeat/harvester/log.go                    | 30 ----------------
 filebeat/harvester/util.go                   | 33 +++++++++++++++++-
 filebeat/harvester/util_test.go              | 19 +++++++++++
 filebeat/tests/system/config/filebeat.yml.j2 |  5 ++-
 filebeat/tests/system/test_prospector.py     | 36 +++++++++++++++++++-
 12 files changed, 157 insertions(+), 33 deletions(-)
 create mode 100644 filebeat/harvester/util_test.go

diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc
index d6875cc5c2ba..6b4ef7f03050 100644
--- a/CHANGELOG.asciidoc
+++ b/CHANGELOG.asciidoc
@@ -27,6 +27,7 @@ https://github.com/elastic/beats/compare/1.0.0...master[Check the HEAD diff]
 *Topbeat*
 
 *Filebeat*
+- Add exclude_files configuration option {pull}563[563]
 
 *Winlogbeat*
 
diff --git a/filebeat/config/config.go b/filebeat/config/config.go
index 5174d8f793e1..db45ea305b22 100644
--- a/filebeat/config/config.go
+++ b/filebeat/config/config.go
@@ -4,6 +4,7 @@ import (
     "log"
     "os"
     "path/filepath"
+    "regexp"
     "time"
 
     "github.com/elastic/beats/libbeat/cfgfile"
@@ -49,6 +50,8 @@ type ProspectorConfig struct {
     ScanFrequency         string `yaml:"scan_frequency"`
     ScanFrequencyDuration time.Duration
     Harvester             HarvesterConfig `yaml:",inline"`
+    ExcludeFiles          []string `yaml:"exclude_files"`
+    ExcludeFilesRegexp    []*regexp.Regexp
 }
 
 type HarvesterConfig struct {
diff --git a/filebeat/crawler/prospector.go b/filebeat/crawler/prospector.go
index b92eae189958..dfadc0d3f431 100644
--- a/filebeat/crawler/prospector.go
+++ b/filebeat/crawler/prospector.go
@@ -52,6 +52,10 @@ func (p *Prospector) setupProspectorConfig() error {
     if err != nil {
         return err
     }
+    config.ExcludeFilesRegexp, err = harvester.InitRegexps(config.ExcludeFiles)
+    if err != nil {
+        return err
+    }
 
     // Init File Stat list
     p.prospectorList = make(map[string]harvester.FileStat)
@@ -219,11 +223,26 @@ func (p *Prospector) stdinRun(spoolChan chan *input.FileEvent) {
     }
 }
 
+func (p *Prospector) isFileExcluded(file string) bool {
+
+    config := &p.ProspectorConfig
+
+    if len(config.ExcludeFilesRegexp) > 0 {
+
+        if harvester.MatchAnyRegexps(config.ExcludeFilesRegexp, file) {
+            return true
+        }
+    }
+
+    return false
+}
+
 // Scans the specific path which can be a glob (/**/**/*.log)
 // For all found files it is checked if a harvester should be started
 func (p *Prospector) scan(path string, output chan *input.FileEvent) {
 
     logp.Debug("prospector", "scan path %s", path)
+    logp.Debug("prospector", "exclude_files: %s", p.ProspectorConfig.ExcludeFiles)
     // Evaluate the path as a wildcards/shell glob
     matches, err := filepath.Glob(path)
     if err != nil {
@@ -237,6 +256,12 @@ func (p *Prospector) scan(path string, output chan *input.FileEvent) {
     for _, file := range matches {
         logp.Debug("prospector", "Check file for harvesting: %s", file)
 
+        // check if the file is in the exclude_files list
+        if p.isFileExcluded(file) {
+            logp.Debug("prospector", "Exclude file: %s", file)
+            continue
+        }
+
         // Stat the file, following any symlinks.
         fileinfo, err := os.Stat(file)
 
diff --git a/filebeat/crawler/prospector_test.go b/filebeat/crawler/prospector_test.go
index e186908b5e05..8ffe9a733f79 100644
--- a/filebeat/crawler/prospector_test.go
+++ b/filebeat/crawler/prospector_test.go
@@ -171,3 +171,23 @@ func TestProspectorInitInputTypeWrong(t *testing.T) {
     assert.Nil(t, err)
     assert.Equal(t, "log", prospector.ProspectorConfig.Harvester.InputType)
 }
+
+func TestProspectorFileExclude(t *testing.T) {
+
+    prospectorConfig := config.ProspectorConfig{
+        ExcludeFiles: []string{"\\.gz$"},
+        Harvester: config.HarvesterConfig{
+            BufferSize: 0,
+        },
+    }
+
+    prospector := Prospector{
+        ProspectorConfig: prospectorConfig,
+    }
+
+    prospector.Init()
+
+    assert.True(t, prospector.isFileExcluded("/tmp/log/logw.gz"))
+    assert.False(t, prospector.isFileExcluded("/tmp/log/logw.log"))
+
+}
diff --git a/filebeat/docs/configuration.asciidoc b/filebeat/docs/configuration.asciidoc
index 6ee7c7b11568..521e1debe253 100644
--- a/filebeat/docs/configuration.asciidoc
+++ b/filebeat/docs/configuration.asciidoc
@@ -91,6 +91,16 @@ If both `include_lines` and `exclude_lines` are defined, then include_lines is c
 exclude_lines: ["^DBG"]
-------------------------------------------------------------------------------------
 
+===== exclude_files
+
+A list of regular expressions to match the files that you want to ignore. By default, no files are excluded.
+
+[source,yaml]
+-------------------------------------------------------------------------------------
+ exclude_files: [".gz$"]
+-------------------------------------------------------------------------------------
+The example above configures Filebeat to ignore all the files with a `gz` extension.
+
 [[configuration-fields]]
 ===== fields
 
diff --git a/filebeat/etc/beat.yml b/filebeat/etc/beat.yml
index de8de4cedc60..64821d7da294 100644
--- a/filebeat/etc/beat.yml
+++ b/filebeat/etc/beat.yml
@@ -40,6 +40,10 @@ filebeat:
       # exclude_lines. By default, all the lines are exported.
       # include_lines: ["^ERR", "^WARN"]
 
+      # Exclude files. A list of regular expressions to match. Filebeat drops the files that
+      # match any regular expression from the list. By default, no files are dropped.
+      # exclude_files: [".gz$"]
+
       # Optional additional fields. These field can be freely picked
       # to add additional information to the crawled log files for filtering
       #fields:
diff --git a/filebeat/etc/filebeat.yml b/filebeat/etc/filebeat.yml
index 8139278c1610..34c3dc5e8240 100644
--- a/filebeat/etc/filebeat.yml
+++ b/filebeat/etc/filebeat.yml
@@ -40,6 +40,10 @@ filebeat:
       # exclude_lines. By default, all the lines are exported.
       # include_lines: ["^ERR", "^WARN"]
 
+      # Exclude files. A list of regular expressions to match. Filebeat drops the files that
+      # match any regular expression from the list. By default, no files are dropped.
+      # exclude_files: [".gz$"]
+
       # Optional additional fields. These field can be freely picked
       # to add additional information to the crawled log files for filtering
       #fields:
diff --git a/filebeat/harvester/log.go b/filebeat/harvester/log.go
index ab1699eddca2..e7efa22a3e4c 100644
--- a/filebeat/harvester/log.go
+++ b/filebeat/harvester/log.go
@@ -5,7 +5,6 @@ import (
     "fmt"
     "io"
     "os"
-    "regexp"
     "time"
 
     "github.com/elastic/beats/filebeat/config"
@@ -332,35 +331,6 @@ func (h *Harvester) handleReadlineError(lastTimeRead time.Time, err error) error
 func (h *Harvester) Stop() {
 }
 
-func InitRegexps(exprs []string) ([]*regexp.Regexp, error) {
-
-    result := []*regexp.Regexp{}
-
-    for _, exp := range exprs {
-
-        rexp, err := regexp.CompilePOSIX(exp)
-        if err != nil {
-            logp.Err("Fail to compile the regexp %s: %s", exp, err)
-            return nil, err
-        }
-        result = append(result, rexp)
-    }
-    return result, nil
-}
-
-func MatchAnyRegexps(regexps []*regexp.Regexp, text string) bool {
-
-    for _, rexp := range regexps {
-        if rexp.MatchString(text) {
-            // drop line
-            return true
-
-        }
-    }
-
-    return false
-}
-
 const maxConsecutiveEmptyReads = 100
 
 // timedReader keeps track of last time bytes have been read from underlying
diff --git a/filebeat/harvester/util.go b/filebeat/harvester/util.go
index 54b0c871588c..2bc9d137f48c 100644
--- a/filebeat/harvester/util.go
+++ b/filebeat/harvester/util.go
@@ -1,9 +1,11 @@
 package harvester
 
 import (
+    "regexp"
+    "time"
+
     "github.com/elastic/beats/filebeat/harvester/encoding"
     "github.com/elastic/beats/libbeat/logp"
-    "time"
 )
 
 // isLine checks if the given byte array is a line, means has a line ending \n
@@ -60,3 +62,32 @@ func readlineString(bytes []byte, size int) (string, int, error) {
     s := string(bytes)[:len(bytes)-lineEndingChars(bytes)]
     return s, size, nil
 }
+
+// InitRegexps initializes a list of compiled regular expressions.
+func InitRegexps(exprs []string) ([]*regexp.Regexp, error) {
+
+    result := []*regexp.Regexp{}
+
+    for _, exp := range exprs {
+
+        rexp, err := regexp.CompilePOSIX(exp)
+        if err != nil {
+            logp.Err("Fail to compile the regexp %s: %s", exp, err)
+            return nil, err
+        }
+        result = append(result, rexp)
+    }
+    return result, nil
+}
+
+// MatchAnyRegexps checks if the text matches any of the regular expressions.
+func MatchAnyRegexps(regexps []*regexp.Regexp, text string) bool {
+
+    for _, rexp := range regexps {
+        if rexp.MatchString(text) {
+            return true
+        }
+    }
+
+    return false
+}
diff --git a/filebeat/harvester/util_test.go b/filebeat/harvester/util_test.go
new file mode 100644
index 000000000000..00364531db94
--- /dev/null
+++ b/filebeat/harvester/util_test.go
@@ -0,0 +1,19 @@
+package harvester
+
+import (
+    "testing"
+
+    "github.com/stretchr/testify/assert"
+)
+
+func TestMatchAnyRegexps(t *testing.T) {
+
+    patterns := []string{"\\.gz$"}
+
+    regexps, err := InitRegexps(patterns)
+
+    assert.Nil(t, err)
+
+    assert.Equal(t, MatchAnyRegexps(regexps, "/var/log/log.gz"), true)
+
+}
diff --git a/filebeat/tests/system/config/filebeat.yml.j2 b/filebeat/tests/system/config/filebeat.yml.j2
index f1df92b6b3fa..69ea994aa651 100644
--- a/filebeat/tests/system/config/filebeat.yml.j2
+++ b/filebeat/tests/system/config/filebeat.yml.j2
@@ -29,7 +29,10 @@ filebeat:
       {% if exclude_lines %}
       exclude_lines: {{exclude_lines}}
       {% endif %}
-
+      {% if exclude_files %}
+      exclude_files: {{exclude_files}}
+      {% endif %}
+
   spool_size:
   idle_timeout: 0.1s
   registry_file: {{ fb.working_dir + '/' }}{{ registryFile|default(".filebeat")}}
diff --git a/filebeat/tests/system/test_prospector.py b/filebeat/tests/system/test_prospector.py
index dbdbfe6305e8..efd862b649d3 100644
--- a/filebeat/tests/system/test_prospector.py
+++ b/filebeat/tests/system/test_prospector.py
@@ -105,7 +105,6 @@ def test_stdin(self):
             lambda: self.output_has(lines=iterations1+iterations2),
             max_timeout=15)
 
-
         proc.kill_and_wait()
 
         objs = self.read_output()
@@ -140,3 +139,38 @@ def test_rotating_ignore_older_larger_write_rate(self):
             max_timeout=15)
 
         proc.kill_and_wait()
+
+    def test_exclude_files(self):
+
+        self.render_config_template(
+            path=os.path.abspath(self.working_dir) + "/log/*",
+            exclude_files=[".gz$"]
+        )
+        os.mkdir(self.working_dir + "/log/")
+
+        testfile = self.working_dir + "/log/test.gz"
+        file = open(testfile, 'w')
+        file.write("line in gz file\n")
+        file.close()
+
+        testfile = self.working_dir + "/log/test.log"
+        file = open(testfile, 'w')
+        file.write("line in log file\n")
+        file.close()
+
+        filebeat = self.start_filebeat()
+
+        self.wait_until(
+            lambda: self.output_has(lines=1),
+            max_timeout=15)
+
+        # TODO: Find a better way to detect that filebeat has finished crawling
+        # Idea: a special flag so that filebeat only does a single
+        # crawl and then exits
+        filebeat.kill_and_wait()
+
+        output = self.read_output()
+
+        # Check that the output file has the same number of lines as the log file
+        assert 1 == len(output)
+        assert output[0]["message"] == "line in log file"
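
For reviewers who want to try the exclusion logic outside of Filebeat, the following standalone Go sketch mirrors what InitRegexps and MatchAnyRegexps from this patch do (POSIX regexp compilation, match against the full file path). It is illustrative only and not part of the patch; the helper names and sample paths are made up for the example.

package main

import (
    "fmt"
    "regexp"
)

// compilePatterns mirrors harvester.InitRegexps: each pattern is compiled
// with CompilePOSIX and the first invalid expression aborts the whole list.
func compilePatterns(exprs []string) ([]*regexp.Regexp, error) {
    result := make([]*regexp.Regexp, 0, len(exprs))
    for _, exp := range exprs {
        rexp, err := regexp.CompilePOSIX(exp)
        if err != nil {
            return nil, fmt.Errorf("failed to compile %q: %v", exp, err)
        }
        result = append(result, rexp)
    }
    return result, nil
}

// matchAny mirrors harvester.MatchAnyRegexps: a file is excluded as soon as
// one of the patterns matches its path.
func matchAny(regexps []*regexp.Regexp, text string) bool {
    for _, rexp := range regexps {
        if rexp.MatchString(text) {
            return true
        }
    }
    return false
}

func main() {
    // Same pattern as in the documentation example: drop gzip-compressed logs.
    excludes, err := compilePatterns([]string{`\.gz$`})
    if err != nil {
        panic(err)
    }

    for _, path := range []string{"/var/log/app.log", "/var/log/app.log.1.gz"} {
        fmt.Printf("%-25s excluded=%v\n", path, matchAny(excludes, path))
    }
}

Note that the patch compiles the patterns with regexp.CompilePOSIX rather than regexp.Compile, so they are interpreted as POSIX ERE with leftmost-longest matching; a simple suffix pattern such as \.gz$ behaves the same under either variant.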