Skip to content

Commit

Permalink
Merge pull request #409 from projectdiscovery/add_line_dedupe_funcs
Browse files Browse the repository at this point in the history
add line dedupe funcs
  • Loading branch information
Mzack9999 authored May 10, 2024
2 parents 40dbb9e + 1b5e022 commit 81d9b9e
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 1 deletion.
35 changes: 35 additions & 0 deletions file/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -570,3 +570,38 @@ func OpenOrCreateFile(name string) (*os.File, error) {
}
return os.OpenFile(name, os.O_RDWR|os.O_CREATE, 0666)
}

// DedupeLines rewrites the file at filename in place, keeping only the first
// occurrence of each line and preserving the order in which lines first appear.
//
// Lines are split with the default bufio.Scanner line splitter, so a trailing
// "\r" is dropped from every line and the rewritten file uses "\n" separators
// with a single trailing newline. A file with no content is left untouched.
//
// All unique lines are kept in memory until the file is rewritten, so the
// function can be memory intensive for large files.
//
// It returns a wrapped error if the file cannot be opened, inspected, read or
// written.
func DedupeLines(filename string) error {
	file, err := os.Open(filename)
	if err != nil {
		return errors.Wrapf(err, "could not open file: %s", filename)
	}
	defer file.Close()

	info, err := file.Stat()
	if err != nil {
		return errors.Wrapf(err, "could not stat file: %s", filename)
	}

	// bufio.Scanner only accepts tokens that are strictly smaller than its
	// buffer limit. Using the exact file size as the limit therefore fails
	// with ErrTooLong for an empty file (limit 0) and for a file made of one
	// line without a trailing newline, so allow one extra byte and never go
	// below the scanner's default limit.
	maxTokenSize := int(info.Size()) + 1
	if maxTokenSize < bufio.MaxScanTokenSize {
		maxTokenSize = bufio.MaxScanTokenSize
	}
	scanner := bufio.NewScanner(file)
	// Start small and let the scanner grow the buffer on demand instead of
	// allocating the whole file size up front.
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxTokenSize)

	seenLines := make(map[string]struct{})
	var deduplicatedLines []string
	for scanner.Scan() {
		line := scanner.Text()
		if _, exists := seenLines[line]; exists {
			continue
		}
		seenLines[line] = struct{}{}
		deduplicatedLines = append(deduplicatedLines, line)
	}
	if err := scanner.Err(); err != nil {
		return errors.Wrapf(err, "could not read file: %s", filename)
	}

	// The scanner yields at least one line for any non-empty input, so no
	// lines means the file was empty: do not turn it into a lone "\n".
	if len(deduplicatedLines) == 0 {
		return nil
	}

	// The file already exists, so os.WriteFile truncates it and keeps its
	// current permissions; 0644 would only apply if it had to be created.
	content := strings.Join(deduplicatedLines, "\n") + "\n"
	if err := os.WriteFile(filename, []byte(content), 0644); err != nil {
		return errors.Wrapf(err, "could not write file: %s", filename)
	}
	return nil
}
20 changes: 19 additions & 1 deletion folder/folderutil.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package folderutil

import (
"errors"
"os"
"path/filepath"
"runtime"
"strings"

"github.com/pkg/errors"
fileutil "github.com/projectdiscovery/utils/file"
mapsutil "github.com/projectdiscovery/utils/maps"
)
Expand Down Expand Up @@ -237,3 +237,21 @@ func SyncDirectory(source, destination string) error {

return nil
}

// DedupeLinesInFiles walks the tree rooted at dir and calls
// fileutil.DedupeLines on every entry that is not a directory.
// The walk stops at the first error, which is returned wrapped with the
// directory name.
// The function can be memory intensive for directories with large files.
func DedupeLinesInFiles(dir string) error {
	visit := func(entryPath string, entry os.FileInfo, walkErr error) error {
		switch {
		case walkErr != nil:
			// Propagate traversal failures unchanged so the walk aborts.
			return walkErr
		case entry.IsDir():
			// Directories are only descended into, never rewritten.
			return nil
		default:
			return fileutil.DedupeLines(entryPath)
		}
	}

	if walkErr := filepath.Walk(dir, visit); walkErr != nil {
		return errors.Wrapf(walkErr, "error processing directory %s", dir)
	}
	return nil
}

0 comments on commit 81d9b9e

Please sign in to comment.