Skip to content

Commit

Permalink
Expose raw utf16 value via custom String type
Browse files Browse the repository at this point in the history
  • Loading branch information
csnewman committed Jun 5, 2024
1 parent aefe164 commit 6527911
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 44 deletions.
2 changes: 1 addition & 1 deletion code.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func (n ArrayLenOpNode) String() string {
type NewInstanceOpNode struct {
Raw Op
Dst uint8
Type string
Type String
}

func (n NewInstanceOpNode) RawOp() Op {
Expand Down
18 changes: 9 additions & 9 deletions iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ type StringIter struct {

type StringNode struct {
Id uint32
Value string
Value String
}

func (r *Reader) StringIter() *StringIter {
Expand Down Expand Up @@ -62,11 +62,11 @@ type ClassIter struct {

type ClassNode struct {
Id uint32
Name string
Name String
AccessFlags uint32
SuperClass string
Interfaces []string
SourceFile string
SuperClass String
Interfaces []String
SourceFile String

StaticFields []FieldNode
InstanceFields []FieldNode
Expand All @@ -78,14 +78,14 @@ type FieldNode struct {
Id uint32
AccessFlags uint32
Type TypeDescriptor
Name string
Name String
}

type MethodNode struct {
Id uint32
AccessFlags uint32
Name string
Shorty string
Name String
Shorty String
ReturnType TypeDescriptor
Params []TypeDescriptor
CodeOff uint32
Expand Down Expand Up @@ -172,7 +172,7 @@ func (r *Reader) ReadClassAndParse(id uint32) (ClassNode, error) {
return res, fmt.Errorf("%w: bad interface list: %w", ErrBadClass, err)
}

res.Interfaces = make([]string, len(list.TypeIds))
res.Interfaces = make([]String, len(list.TypeIds))

for p, id := range list.TypeIds {
parsedDesc, err := r.ReadTypeAndParse(uint32(id))
Expand Down
43 changes: 30 additions & 13 deletions reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"io"
"unicode/utf16"
)

var (
Expand Down Expand Up @@ -109,33 +110,49 @@ func Read(file io.ReaderAt, opts ...Opt) (*Reader, error) {
return r, nil
}

func (r *Reader) ReadString(id uint32) (string, error) {
// String pairs a sequence of UTF-16 code units with the Go string decoded
// from them, so callers can reach the raw value as well as the parsed one.
type String struct {
	// Raw holds the UTF-16 code units exactly as given to StringFromUTF16.
	Raw []uint16
	// Parsed is Raw run through utf16.Decode and converted to a Go string.
	Parsed string
}

// StringFromUTF16 builds a String from the given UTF-16 code units.
//
// The slice is stored as-is in Raw (it is not copied, so the result shares
// its backing array with points), and Parsed is filled with the decoded text.
func StringFromUTF16(points []uint16) String {
	decoded := utf16.Decode(points)

	var s String
	s.Raw = points
	s.Parsed = string(decoded)

	return s
}

// String implements fmt.Stringer by returning the decoded (Parsed) form.
func (s String) String() string {
	return s.Parsed
}

func (r *Reader) ReadString(id uint32) (String, error) {
if id >= r.StringIDCount {
return "", ErrInvalidStringID
return String{}, ErrInvalidStringID
}

idPos := r.stringIDOff + (id * 4)

strPos, err := r.readUint(idPos)
if err != nil {
return "", err
return String{}, err
}

strSize, n, err := r.readUleb128(strPos)
if err != nil {
return "", err
return String{}, err
}

if strSize == 0 {
return "", nil
return String{}, nil
}

// MUTF-8 encodes up to 3 bytes per char
data := make([]byte, strSize*3+1)

rsize, err := r.file.ReadAt(data, int64(strPos+n))
if err != nil && err != io.EOF {
return "", err
return String{}, err
}

// Find the null terminating byte
Expand All @@ -150,12 +167,12 @@ func (r *Reader) ReadString(id uint32) (string, error) {
}

if pos == rsize {
return "", ErrNoStringEnd
return String{}, ErrNoStringEnd
}

str, err := MUTF8Decode(data[:pos], int(strSize))
if err != nil {
return "", fmt.Errorf("mutf8-8 decode failed: %w", err)
return String{}, fmt.Errorf("mutf8-8 decode failed: %w", err)
}

return str, nil
Expand Down Expand Up @@ -228,9 +245,9 @@ type Field struct {
}

type FieldRef struct {
Class string
Class String
Type TypeDescriptor
Name string
Name String
}

func (r FieldRef) String() string {
Expand Down Expand Up @@ -306,9 +323,9 @@ type Method struct {
}

type MethodRef struct {
Class string
Name string
Shorty string
Class String
Name String
Shorty String
ReturnType TypeDescriptor
Params []TypeDescriptor
}
Expand Down
14 changes: 7 additions & 7 deletions types.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,19 @@ func (r *Reader) ReadTypeAndParse(id uint32) (TypeDescriptor, error) {
type TypeDescriptor struct {
Type uint8
ArrayLength int
ClassName string
ClassName String
}

func ParseTypeDescriptor(value string) (TypeDescriptor, error) {
func ParseTypeDescriptor(value String) (TypeDescriptor, error) {
var res TypeDescriptor

l := len(value)
l := len(value.Raw)

if l == 0 {
return res, ErrEmptyTypeDesc
}

for value[res.ArrayLength] == '[' {
for value.Raw[res.ArrayLength] == '[' {
res.ArrayLength++

// Ensure there is a next character
Expand All @@ -51,7 +51,7 @@ func ParseTypeDescriptor(value string) (TypeDescriptor, error) {
}
}

res.Type = value[res.ArrayLength]
res.Type = uint8(value.Raw[res.ArrayLength])

// Check if a string
if res.Type != 'L' {
Expand All @@ -63,15 +63,15 @@ func ParseTypeDescriptor(value string) (TypeDescriptor, error) {
return res, nil
}

if value[l-1] != ';' {
if value.Raw[l-1] != ';' {
return res, fmt.Errorf("%w: %v", ErrBadTypeDesc, value)
}

if l-2-res.ArrayLength <= 0 {
return res, fmt.Errorf("%w: %v", ErrBadTypeDesc, value)
}

res.ClassName = value[1+res.ArrayLength : l-1]
res.ClassName = StringFromUTF16(value.Raw[1+res.ArrayLength : l-1])

return res, nil
}
Expand Down
20 changes: 6 additions & 14 deletions utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ import (
"fmt"
"io"
"math"
"unicode/utf16"
"unicode/utf8"
)

const reNotImpl = "reverse endian not implemented"
Expand Down Expand Up @@ -90,42 +88,36 @@ func (r *Reader) readUleb128(pos uint32) (uint32, uint32, error) {

var ErrMUTF8 = errors.New("invalid encoding")

func MUTF8Decode(d []byte, expectedSize int) (string, error) {
if utf8.Valid(d) {
return string(d), nil
}

func MUTF8Decode(d []byte, expectedSize int) (String, error) {
inLen := len(d)
buf := make([]uint16, 0, expectedSize)

for i := 0; i < inLen; {
if d[i] == 0 {
return "", fmt.Errorf("%w: null unexpected", ErrMUTF8)
return String{}, fmt.Errorf("%w: null unexpected", ErrMUTF8)
} else if d[i] < 0x80 {
buf = append(buf, uint16(d[i]))
i++
} else if d[i]&0xE0 == 0xC0 {
if i+1 >= inLen {
return "", fmt.Errorf("%w: bytes missing", ErrMUTF8)
return String{}, fmt.Errorf("%w: bytes missing", ErrMUTF8)
}

buf = append(buf, ((uint16(d[i])&0x1F)<<6)|(uint16(d[i+1])&0x3F))
i += 2
} else if d[i]&0xF0 == 0xE0 {
if i+2 >= inLen {
return "", fmt.Errorf("%w: bytes missing", ErrMUTF8)
return String{}, fmt.Errorf("%w: bytes missing", ErrMUTF8)
}

buf = append(buf, ((uint16(d[i])&0x0F)<<12)|((uint16(d[i+1])&0x3F)<<6)|(uint16(d[i+2])&0x3F))
i += 3
} else {
return "", fmt.Errorf("%w: unexpected byte", ErrMUTF8)
return String{}, fmt.Errorf("%w: unexpected byte", ErrMUTF8)
}
}

runes := utf16.Decode(buf)

return string(runes), nil
return StringFromUTF16(buf), nil
}

func (r *Reader) readSleb128(pos uint32) (int32, uint32, error) {
Expand Down

0 comments on commit 6527911

Please sign in to comment.