sources.go

package sources

import (
	"sync"

	"google.golang.org/protobuf/types/known/anypb"

	"github.com/trufflesecurity/trufflehog/v3/pkg/common"
	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
)

type (
	SourceID int64
	JobID    int64
)

// Chunk contains data to be decoded and scanned along with context on where it came from.
//
// Important: The order of the fields in this struct is specifically designed to optimize
// struct alignment and minimize memory usage. Do not change the field order without carefully
// considering the potential impact on memory consumption.
// Ex: https://go.dev/play/p/Azf4a7O-DhC
type Chunk struct {
	// Data is the data to decode and scan.
	Data []byte
	// SourceName is the name of the Source that produced the chunk.
	SourceName string
	// SourceID is the ID of the source that the Chunk originated from.
	SourceID SourceID
	// JobID is the ID of the job that the Chunk originated from.
	JobID JobID
	// SecretID is the ID of the secret, if it exists.
	// Only secrets that are being reverified will have a SecretID.
	SecretID int64
	// SourceMetadata holds the context of where the Chunk was found.
	SourceMetadata *source_metadatapb.MetaData
	// SourceType is the type of Source that produced the chunk.
	SourceType sourcespb.SourceType
	// Verify specifies whether any secrets in the Chunk should be verified.
	Verify bool
}
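
// A minimal sketch of the alignment principle the Chunk field order relies
// on. The types below are hypothetical and only illustrate why ordering
// fields from larger to smaller alignment reduces padding:
//
//	type padded struct { // 24 bytes on 64-bit: each bool forces padding around the int64
//		A bool
//		B int64
//		C bool
//	}
//
//	type packed struct { // 16 bytes: both bools share the trailing padded word
//		B int64
//		A bool
//		C bool
//	}
//
//	// unsafe.Sizeof(padded{}) == 24; unsafe.Sizeof(packed{}) == 16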

// ChunkingTarget specifies criteria for a targeted chunking process.
// Instead of collecting data indiscriminately, this struct allows the caller
// to specify particular subsets of data they're interested in. This becomes
// especially useful when one needs to verify or recheck specific data points
// without processing the entire dataset.
type ChunkingTarget struct {
	// QueryCriteria represents specific parameters or conditions to target the chunking process.
	QueryCriteria *source_metadatapb.MetaData
	// SecretID is the ID of the secret.
	SecretID int64
}

// Source defines the interface required to implement a source chunker.
type Source interface {
	// Type returns the source type, used for matching against configuration and jobs.
	Type() sourcespb.SourceType
	// SourceID returns the initialized source ID used for tracking relationships in the DB.
	SourceID() SourceID
	// JobID returns the initialized job ID used for tracking relationships in the DB.
	JobID() JobID
	// Init initializes the source.
	Init(aCtx context.Context, name string, jobId JobID, sourceId SourceID, verify bool, connection *anypb.Any, concurrency int) error
	// Chunks emits data over a channel which is then decoded and scanned for secrets.
	// By default, data is obtained indiscriminately. However, by providing one or more
	// ChunkingTarget parameters, the caller can direct the function to retrieve
	// specific chunks of data. This targeted approach allows for efficient and
	// intentional data processing, beneficial when verifying or rechecking specific data points.
	Chunks(ctx context.Context, chunksChan chan *Chunk, targets ...ChunkingTarget) error
	// GetProgress returns the completion progress (percentage) for the scanned source.
	GetProgress() *Progress
}
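
// A minimal sketch of driving a Source, assuming a concrete implementation
// src and a configured *anypb.Any connection conn (both hypothetical, as is
// the fmt-based error handling). Chunks blocks until scanning finishes, so
// it is typically run in a goroutine while the channel is drained:
//
//	ctx := context.Background()
//	if err := src.Init(ctx, "my-source", JobID(1), SourceID(1), true, conn, 4); err != nil {
//		return err
//	}
//	chunksChan := make(chan *Chunk)
//	go func() {
//		defer close(chunksChan)
//		if err := src.Chunks(ctx, chunksChan); err != nil {
//			fmt.Println("fatal source error:", err)
//		}
//	}()
//	for chunk := range chunksChan {
//		_ = chunk // decode and scan chunk.Data here
//	}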

// SourceUnitEnumChunker combines the two interfaces required to support
// enumerating and chunking of units.
type SourceUnitEnumChunker interface {
	SourceUnitEnumerator
	SourceUnitChunker
}

// SourceUnitUnmarshaller defines an optional interface a Source can implement
// to support units coming from an external source.
type SourceUnitUnmarshaller interface {
	UnmarshalSourceUnit(data []byte) (SourceUnit, error)
}
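
// A minimal sketch of implementing SourceUnitUnmarshaller, assuming a
// hypothetical JSON-encoded unit type and an encoding/json import:
//
//	type repoUnit struct {
//		Name string `json:"name"`
//	}
//
//	func (u repoUnit) SourceUnitID() (string, SourceUnitKind) { return u.Name, "repo" }
//	func (u repoUnit) Display() string                        { return u.Name }
//
//	func (s *repoSource) UnmarshalSourceUnit(data []byte) (SourceUnit, error) {
//		var unit repoUnit
//		if err := json.Unmarshal(data, &unit); err != nil {
//			return nil, err
//		}
//		return unit, nil
//	}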

// SourceUnitEnumerator defines an optional interface a Source can implement to
// support enumerating an initialized Source into SourceUnits.
type SourceUnitEnumerator interface {
	// Enumerate creates 0 or more units from an initialized source,
	// reporting them or any errors to the UnitReporter. This method is
	// synchronous but can be called in a goroutine to support concurrent
	// enumeration and chunking. An error should only be returned from this
	// method in the case of context cancellation, fatal source errors, or
	// errors returned by the reporter. All other errors related to unit
	// enumeration are tracked by the UnitReporter.
	Enumerate(ctx context.Context, reporter UnitReporter) error
}

// UnitReporter defines the interface a source will use to report whether a
// unit was found during enumeration. Either method may be called any number of
// times. Implementors of this interface should allow for concurrent calls.
type UnitReporter interface {
	UnitOk(ctx context.Context, unit SourceUnit) error
	UnitErr(ctx context.Context, err error) error
}
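
// A minimal sketch of an Enumerate implementation honoring the contract
// above; s.repos and makeUnit are hypothetical:
//
//	func (s *repoSource) Enumerate(ctx context.Context, reporter UnitReporter) error {
//		for _, repo := range s.repos {
//			unit, err := makeUnit(repo)
//			if err != nil {
//				// Non-fatal: record the error and continue enumerating.
//				if err := reporter.UnitErr(ctx, err); err != nil {
//					return err // reporter errors are fatal
//				}
//				continue
//			}
//			if err := reporter.UnitOk(ctx, unit); err != nil {
//				return err // reporter errors are fatal
//			}
//		}
//		return nil
//	}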

// SourceUnitChunker defines an optional interface a Source can implement to
// support chunking a single SourceUnit.
type SourceUnitChunker interface {
	// ChunkUnit creates 0 or more chunks from a unit, reporting them or
	// any errors to the ChunkReporter. An error should only be returned
	// from this method in the case of context cancellation, fatal source
	// errors, or errors returned by the reporter. All other errors related
	// to unit chunking are tracked by the ChunkReporter.
	ChunkUnit(ctx context.Context, unit SourceUnit, reporter ChunkReporter) error
}

// ChunkReporter defines the interface a source will use to report whether a
// chunk was found during unit chunking. Either method may be called any number
// of times. Implementors of this interface should allow for concurrent calls.
type ChunkReporter interface {
	ChunkOk(ctx context.Context, chunk Chunk) error
	ChunkErr(ctx context.Context, err error) error
}
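
// A minimal sketch of a ChunkUnit implementation honoring the contract
// above; s.name, s.verify, and s.readUnit are hypothetical:
//
//	func (s *repoSource) ChunkUnit(ctx context.Context, unit SourceUnit, reporter ChunkReporter) error {
//		id, _ := unit.SourceUnitID()
//		data, err := s.readUnit(ctx, id)
//		if err != nil {
//			// Non-fatal: record the error against this unit's chunking.
//			return reporter.ChunkErr(ctx, err)
//		}
//		chunk := Chunk{
//			Data:       data,
//			SourceName: s.name,
//			SourceID:   s.SourceID(),
//			JobID:      s.JobID(),
//			SourceType: s.Type(),
//			Verify:     s.verify,
//		}
//		return reporter.ChunkOk(ctx, chunk)
//	}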

type SourceUnitKind string

// SourceUnit is an object that represents a Source's unit of work. This is
// used as the output of enumeration, progress reporting, and job distribution.
type SourceUnit interface {
	// SourceUnitID uniquely identifies a source unit. It does not need to
	// be human readable or two-way, however, it should be canonical and
	// stable across runs.
	SourceUnitID() (string, SourceUnitKind)
	// Display is the human readable representation of the SourceUnit.
	Display() string
}
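
// A minimal sketch of a SourceUnit implementation; the bucketUnit type and
// "bucket" kind are hypothetical:
//
//	type bucketUnit struct{ name string }
//
//	func (b bucketUnit) SourceUnitID() (string, SourceUnitKind) {
//		// The bucket name is canonical and stable across runs.
//		return b.name, SourceUnitKind("bucket")
//	}
//
//	func (b bucketUnit) Display() string { return b.name }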

// DockerConfig defines the optional configuration for a Docker source.
type DockerConfig struct {
	// Images is the list of images to scan.
	Images []string
	// BearerToken is the token to use to authenticate with the source.
	BearerToken string
	// UseDockerKeychain determines whether to use the Docker keychain.
	UseDockerKeychain bool
}

// GCSConfig defines the optional configuration for a GCS source.
type GCSConfig struct {
	// CloudCred determines whether to use cloud credentials.
	// This can NOT be used with a secret.
	CloudCred,
	// WithoutAuth is a flag to indicate whether to use authentication.
	WithoutAuth bool
	// ApiKey is the API key to use to authenticate with the source.
	ApiKey,
	// ProjectID is the project ID to use to authenticate with the source.
	ProjectID,
	// ServiceAccount is the service account to use to authenticate with the source.
	ServiceAccount string
	// MaxObjectSize is the maximum object size to scan.
	MaxObjectSize int64
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
	// IncludeBuckets is a list of buckets to include in the scan.
	IncludeBuckets,
	// ExcludeBuckets is a list of buckets to exclude from the scan.
	ExcludeBuckets,
	// IncludeObjects is a list of objects to include in the scan.
	IncludeObjects,
	// ExcludeObjects is a list of objects to exclude from the scan.
	ExcludeObjects []string
}

// GitConfig defines the optional configuration for a git source.
type GitConfig struct {
	// HeadRef is the head reference to use to scan from.
	HeadRef string
	// BaseRef is the base reference to use to scan from.
	BaseRef string
	// MaxDepth is the maximum depth to scan the source.
	MaxDepth int
	// Bare is an indicator to handle bare repositories properly.
	Bare bool
	// URI is the URI of the repository to scan. file://, http://, https:// and ssh:// are supported.
	URI string
	// IncludePathsFile is the path to a file containing a list of regexps to include in the scan.
	IncludePathsFile string
	// ExcludePathsFile is the path to a file containing a list of regexps to exclude from the scan.
	ExcludePathsFile string
	// ExcludeGlobs is a list of comma-separated globs to exclude from the scan.
	// This differs from the Filter exclusions as ExcludeGlobs is applied at the `git log -p` level.
	ExcludeGlobs string
	// SkipBinaries allows skipping binary files from the scan.
	SkipBinaries bool
}

// GithubConfig defines the optional configuration for a github source.
type GithubConfig struct {
	// Endpoint is the endpoint of the source.
	Endpoint string
	// Token is the token to use to authenticate with the source.
	Token string
	// IncludeForks indicates whether to include forks in the scan.
	IncludeForks bool
	// IncludeMembers indicates whether to include members in the scan.
	IncludeMembers bool
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
	// Repos is the list of repositories to scan.
	Repos []string
	// Orgs is the list of organizations to scan.
	Orgs []string
	// ExcludeRepos is a list of repositories to exclude from the scan.
	ExcludeRepos []string
	// IncludeRepos is a list of repositories to include in the scan.
	IncludeRepos []string
	// Filter is the filter to use to scan the source.
	Filter *common.Filter
	// IncludeIssueComments indicates whether to include GitHub issue comments in the scan.
	IncludeIssueComments bool
	// IncludePullRequestComments indicates whether to include GitHub pull request comments in the scan.
	IncludePullRequestComments bool
	// IncludeGistComments indicates whether to include GitHub gist comments in the scan.
	IncludeGistComments bool
	// SkipBinaries allows skipping binary files from the scan.
	SkipBinaries bool
	// IncludeWikis indicates whether to include repository wikis in the scan.
	IncludeWikis bool
	// CommentsTimeframeDays indicates how many days of comments to include in the scan.
	CommentsTimeframeDays uint32
}

// GitHubExperimentalConfig defines the optional configuration for an experimental GitHub source.
type GitHubExperimentalConfig struct {
	// Repository is the repository to scan.
	Repository string
	// Token is the token to use to authenticate with the source.
	Token string
	// ObjectDiscovery indicates whether to discover all commit objects (CFOR) in the repository.
	ObjectDiscovery bool
	// CollisionThreshold is the number of short-sha collisions tolerated during hidden data enumeration. Default is 1.
	CollisionThreshold int
	// DeleteCachedData indicates whether to delete cached data.
	DeleteCachedData bool
}

// GitlabConfig defines the optional configuration for a gitlab source.
type GitlabConfig struct {
	// Endpoint is the endpoint of the source.
	Endpoint string
	// Token is the token to use to authenticate with the source.
	Token string
	// Repos is the list of repositories to scan.
	Repos []string
	// Filter is the filter to use to scan the source.
	Filter *common.Filter
	// SkipBinaries allows skipping binary files from the scan.
	SkipBinaries bool
}

// FilesystemConfig defines the optional configuration for a filesystem source.
type FilesystemConfig struct {
	// Paths is the list of files and directories to scan.
	Paths []string
	// IncludePathsFile is the path to a file containing a list of regexps to include in the scan.
	IncludePathsFile string
	// ExcludePathsFile is the path to a file containing a list of regexps to exclude from the scan.
	ExcludePathsFile string
}

// S3Config defines the optional configuration for an S3 source.
type S3Config struct {
	// CloudCred determines whether to use cloud credentials.
	// This can NOT be used with a secret.
	CloudCred bool
	// Key is any key to use to authenticate with the source.
	Key,
	// Secret is any secret to use to authenticate with the source.
	Secret,
	// SessionToken is the temporary session token associated with a temporary access key ID and secret key.
	SessionToken string
	// Buckets is the list of buckets to scan.
	Buckets []string
	// IgnoreBuckets is the list of buckets to ignore.
	IgnoreBuckets []string
	// Roles is the list of roles to use.
	Roles []string
	// MaxObjectSize is the maximum object size to scan.
	MaxObjectSize int64
}

// SyslogConfig defines the optional configuration for a syslog source.
type SyslogConfig struct {
	// Address used to connect to the source.
	Address,
	// Protocol used to connect to the source.
	Protocol,
	// CertPath is the path to the certificate to use to connect to the source.
	CertPath,
	// Format is the format used to connect to the source.
	Format,
	// KeyPath is the path to the key to use to connect to the source.
	KeyPath string
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
}

// PostmanConfig defines the optional configuration for a Postman source.
type PostmanConfig struct {
	// Workspaces is a list of workspace UUIDs or file paths to Postman workspaces (.zip).
	Workspaces []string
	// Collections is a list of collection IDs or file paths to Postman collections (.json).
	Collections []string
	// Environments is a list of environment IDs or file paths to Postman environments (.json).
	Environments []string
	// Token is the token to use to authenticate with the API.
	Token string
	// IncludeCollections is a list of collections to include in the scan.
	IncludeCollections []string
	// IncludeEnvironments is a list of environments to include in the scan.
	IncludeEnvironments []string
	// ExcludeCollections is a list of collections to exclude from the scan.
	ExcludeCollections []string
	// ExcludeEnvironments is a list of environments to exclude from the scan.
	ExcludeEnvironments []string
	// Concurrency is the number of concurrent workers to use to scan the source.
	Concurrency int
	// CollectionPaths is the list of paths to Postman collections.
	CollectionPaths []string
	// WorkspacePaths is the list of paths to Postman workspaces.
	WorkspacePaths []string
	// EnvironmentPaths is the list of paths to Postman environments.
	EnvironmentPaths []string
	// Filter is the filter to use to scan the source.
	Filter *common.Filter
}

// ElasticsearchConfig defines the optional configuration for an Elasticsearch source.
type ElasticsearchConfig struct {
	Nodes          []string
	Username       string
	Password       string
	CloudID        string
	APIKey         string
	ServiceToken   string
	IndexPattern   string
	QueryJSON      string
	SinceTimestamp string
	BestEffortScan bool
}

// Progress is used to update job completion progress across sources.
type Progress struct {
	mut               sync.Mutex
	PercentComplete   int64
	Message           string
	EncodedResumeInfo string
	SectionsCompleted int32
	SectionsRemaining int32
}

// Validator is an interface for validating a source. Sources can optionally
// implement this interface to validate their configuration.
type Validator interface {
	Validate(ctx context.Context) []error
}
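
// A minimal sketch of implementing Validator, assuming a hypothetical source
// with token and repos fields; all configuration problems are collected and
// returned together rather than failing on the first one:
//
//	func (s *repoSource) Validate(ctx context.Context) []error {
//		var errs []error
//		if s.token == "" {
//			errs = append(errs, fmt.Errorf("missing token"))
//		}
//		if len(s.repos) == 0 {
//			errs = append(errs, fmt.Errorf("no repositories configured"))
//		}
//		return errs
//	}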

// SetProgressComplete sets job progress information for a running job based on the highest level objects in the source.
// i is the current iteration in the loop over the target scope.
// scope should be len(scopedItems).
// message is the public-facing user information about the current progress.
// encodedResumeInfo is an optional string representing any information necessary to resume the job if interrupted.
//
// NOTE: SetProgressOngoing should be used when the source does not yet know how many items it is scanning (scope)
// and does not want to display a percentage complete.
func (p *Progress) SetProgressComplete(i, scope int, message, encodedResumeInfo string) {
	p.mut.Lock()
	defer p.mut.Unlock()

	p.Message = message
	p.EncodedResumeInfo = encodedResumeInfo
	p.SectionsCompleted = int32(i)
	p.SectionsRemaining = int32(scope)

	// If the iteration and scope are both 0, completion is 100%.
	if i == 0 && scope == 0 {
		p.PercentComplete = 100
		return
	}

	p.PercentComplete = int64((float64(i) / float64(scope)) * 100)
}

// SetProgressOngoing sets information about the current running job based on
// the highest level objects in the source.
// message is the public-facing user information about the current progress.
// encodedResumeInfo is an optional string representing any information necessary to resume the job if interrupted.
//
// NOTE: This method should be used over SetProgressComplete when the source does
// not yet know how many items it is scanning and does not want to display a percentage complete.
func (p *Progress) SetProgressOngoing(message string, encodedResumeInfo string) {
	p.mut.Lock()
	defer p.mut.Unlock()

	p.Message = message
	p.EncodedResumeInfo = encodedResumeInfo
	// Explicitly set SectionsRemaining to 0 so the frontend does not display a percent.
	p.SectionsRemaining = 0
}
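
// A minimal sketch of reporting progress from a scan loop; items and
// resumeInfo are hypothetical. With a known scope, report a percentage:
//
//	for i, item := range items {
//		// ... scan item ...
//		p.SetProgressComplete(i+1, len(items), fmt.Sprintf("scanned %s", item), resumeInfo)
//	}
//
// When the total number of items is unknown up front, report without a
// percentage instead:
//
//	p.SetProgressOngoing("scanning stream", resumeInfo)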

// GetProgress gets job completion percentage for metrics reporting.
func (p *Progress) GetProgress() *Progress {
	p.mut.Lock()
	defer p.mut.Unlock()
	return p
}