Skip to content

Commit

Permalink
Detect encoding changes while parsing diff (go-gitea#16330)
Browse files: browse the repository at this point in the history
* Detect encoding changes while parsing diff
  • Loading branch information
jpraet authored and zhaoxin committed Jul 13, 2021
1 parent b04748d commit a748490
Showing 1 changed file with 31 additions and 19 deletions.
50 changes: 31 additions & 19 deletions services/gitdiff/gitdiff.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,6 +32,7 @@ import (

"github.com/sergi/go-diff/diffmatchpatch"
stdcharset "golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)

Expand Down Expand Up @@ -883,35 +884,46 @@ parsingLoop:

}

// FIXME: There are numerous issues with this:
// TODO: There are numerous issues with this:
// - we might want to consider detecting encoding while parsing but...
// - we're likely to fail to get the correct encoding here anyway as we won't have enough information
// - and this doesn't really account for changes in encoding
var buf bytes.Buffer
var diffLineTypeBuffers = make(map[DiffLineType]*bytes.Buffer, 3)
var diffLineTypeDecoders = make(map[DiffLineType]*encoding.Decoder, 3)
diffLineTypeBuffers[DiffLinePlain] = new(bytes.Buffer)
diffLineTypeBuffers[DiffLineAdd] = new(bytes.Buffer)
diffLineTypeBuffers[DiffLineDel] = new(bytes.Buffer)
for _, f := range diff.Files {
buf.Reset()
for _, buffer := range diffLineTypeBuffers {
buffer.Reset()
}
for _, sec := range f.Sections {
for _, l := range sec.Lines {
if l.Type == DiffLineSection {
continue
}
buf.WriteString(l.Content[1:])
buf.WriteString("\n")
diffLineTypeBuffers[l.Type].WriteString(l.Content[1:])
diffLineTypeBuffers[l.Type].WriteString("\n")
}
}
charsetLabel, err := charset.DetectEncoding(buf.Bytes())
if charsetLabel != "UTF-8" && err == nil {
encoding, _ := stdcharset.Lookup(charsetLabel)
if encoding != nil {
d := encoding.NewDecoder()
for _, sec := range f.Sections {
for _, l := range sec.Lines {
if l.Type == DiffLineSection {
continue
}
if c, _, err := transform.String(d, l.Content[1:]); err == nil {
l.Content = l.Content[0:1] + c
}
for lineType, buffer := range diffLineTypeBuffers {
diffLineTypeDecoders[lineType] = nil
if buffer.Len() == 0 {
continue
}
charsetLabel, err := charset.DetectEncoding(buffer.Bytes())
if charsetLabel != "UTF-8" && err == nil {
encoding, _ := stdcharset.Lookup(charsetLabel)
if encoding != nil {
diffLineTypeDecoders[lineType] = encoding.NewDecoder()
}
}
}
for _, sec := range f.Sections {
for _, l := range sec.Lines {
decoder := diffLineTypeDecoders[l.Type]
if decoder != nil {
if c, _, err := transform.String(decoder, l.Content[1:]); err == nil {
l.Content = l.Content[0:1] + c
}
}
}
Expand Down

0 comments on commit a748490

Please sign in to comment.