From 8eb548f6f4675452e12947f64792560b7fdd2bfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Fri, 10 May 2024 11:27:40 +0300 Subject: [PATCH 1/2] add line dedupe funcs --- file/file.go | 28 ++++++++++++++++++++++++++++ folder/folderutil.go | 20 +++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/file/file.go b/file/file.go index a6c18fe..e8a2a39 100644 --- a/file/file.go +++ b/file/file.go @@ -570,3 +570,31 @@ func OpenOrCreateFile(name string) (*os.File, error) { } return os.OpenFile(name, os.O_RDWR|os.O_CREATE, 0666) } + +// DedupeLines reads a file and removes duplicate lines from it. +// The function can be memory intensive for large files. +func DedupeLines(filename string) error { + file, err := os.Open(filename) + if err != nil { + return errors.Wrapf(err, "could not open file: %s", filename) + } + defer file.Close() + + seenLines := make(map[string]struct{}) + var deduplicatedLines []string + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + if _, exists := seenLines[line]; !exists { + seenLines[line] = struct{}{} + deduplicatedLines = append(deduplicatedLines, line) + } + } + + if err := scanner.Err(); err != nil { + return errors.Wrapf(err, "could not read file: %s", filename) + } + + return os.WriteFile(filename, []byte(strings.Join(deduplicatedLines, "\n")+"\n"), 0644) +} diff --git a/folder/folderutil.go b/folder/folderutil.go index 6201b00..d76b67f 100644 --- a/folder/folderutil.go +++ b/folder/folderutil.go @@ -1,12 +1,12 @@ package folderutil import ( - "errors" "os" "path/filepath" "runtime" "strings" + "github.com/pkg/errors" fileutil "github.com/projectdiscovery/utils/file" mapsutil "github.com/projectdiscovery/utils/maps" ) @@ -237,3 +237,21 @@ func SyncDirectory(source, destination string) error { return nil } + +// DedupeLinesInFiles deduplicates lines in all files in a directory +// The function can be memory intensive for directories with large files. +func DedupeLinesInFiles(dir string) error { + err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !info.IsDir() { + return fileutil.DedupeLines(path) + } + return nil + }) + if err != nil { + return errors.Wrapf(err, "error processing directory %s", dir) + } + return nil +} From 1b5e022e7b669bda8fb54bd0d0fb1b2fd8199712 Mon Sep 17 00:00:00 2001 From: mzack9999 Date: Fri, 10 May 2024 10:42:37 +0200 Subject: [PATCH 2/2] increasing buffer to file size --- file/file.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/file/file.go b/file/file.go index e8a2a39..08ae130 100644 --- a/file/file.go +++ b/file/file.go @@ -583,7 +583,14 @@ func DedupeLines(filename string) error { seenLines := make(map[string]struct{}) var deduplicatedLines []string + info, err := file.Stat() + if err != nil { + return err + } scanner := bufio.NewScanner(file) + maxSize := int(info.Size()) + buffer := make([]byte, 0, maxSize) + scanner.Buffer(buffer, maxSize) for scanner.Scan() { line := scanner.Text() if _, exists := seenLines[line]; !exists {