Skip to content

Commit

Permalink
add multiple fields to remove duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
mayuze committed Dec 16, 2024
1 parent 9a23435 commit ff27c14
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 28 deletions.
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## v0.2.27 add multiple fields to remove duplicates in dedup mode

- dedup mode can now remove duplicates based on a combination of multiple fields, e.g.: ```fofa dedup -d host,port,ip,protocol -i input.csv -o output.csv```

## v0.2.26 update jsRender mode and headline in dump mode

- update jsRender deadline
Expand Down
58 changes: 30 additions & 28 deletions cmd/fofa/cmd/duplicate.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,46 +79,48 @@ func writeCSV(filePath string, records [][]string) error {
return nil
}

// indexOf returns the position of field within headers, or -1 when headers
// does not contain it.
func indexOf(headers []string, field string) int {
	for i, header := range headers {
		if header == field {
			return i
		}
	}
	return -1
}

// deduplicates removes rows whose values in the given fields, taken together,
// repeat those of an earlier row.
//
// records is a CSV table whose first row is the header. fields names the
// header columns that form the composite deduplication key. The result is the
// original header row followed by the first occurrence of each distinct key,
// in input order; every kept row retains all of its columns.
//
// An error is returned when records holds no data rows, when fields is empty,
// when a field is not present in the header row, or when a data row is too
// short to contain one of the key columns.
func deduplicates(records [][]string, fields []string) ([][]string, error) {
	if len(records) < 2 {
		return nil, errors.New("deduplicate failed: CSV file is empty")
	}
	// With no key columns every row would share the same empty key and all
	// but the first data row would be silently dropped.
	if len(fields) == 0 {
		return nil, errors.New("deduplicate failed: no fields specified")
	}

	// Resolve each requested field to its column index in the header row
	// (not to its position in the fields argument).
	headers := records[0]
	fieldIndexes := make([]int, 0, len(fields))
	for _, field := range fields {
		index := indexOf(headers, field)
		if index == -1 {
			return nil, fmt.Errorf("field '%s' not found in headers", field)
		}
		fieldIndexes = append(fieldIndexes, index)
	}

	seen := make(map[string]bool, len(records)-1)
	uniqueRows := make([][]string, 0, len(records))
	uniqueRows = append(uniqueRows, headers)

	var key strings.Builder
	for n, row := range records[1:] {
		key.Reset()
		for _, idx := range fieldIndexes {
			if idx >= len(row) {
				// n is zero-based over the data rows; +2 gives the
				// one-based record number counting the header as 1.
				return nil, fmt.Errorf("deduplicate failed: record %d has %d columns, expected at least %d", n+2, len(row), idx+1)
			}
			// Length-prefix every part so that values containing the
			// separator cannot make two different rows share a key.
			fmt.Fprintf(&key, "%d:%s", len(row[idx]), row[idx])
		}
		k := key.String()
		if !seen[k] {
			seen[k] = true
			uniqueRows = append(uniqueRows, row)
		}
	}
	return uniqueRows, nil
}

func deduplicateAction(ctx *cli.Context) error {
Expand Down

0 comments on commit ff27c14

Please sign in to comment.