Skip to content

Commit

Permalink
fix: utf16 encoding error
Browse files Browse the repository at this point in the history
  • Loading branch information
terasum committed Oct 7, 2023
1 parent bf870df commit 70ab78e
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 68 deletions.
8 changes: 8 additions & 0 deletions frontend/src/view/main/MainFunctions.vue
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,14 @@ function listenInputWordUpdate() {
after((result) => {
switch (name) {
case 'updateInputSearchWord': {
// inputWord.value = dictQueryStore.inputSearchWord;
break;
}
case 'forwardHistory': {
inputWord.value = dictQueryStore.inputSearchWord;
break;
}
case 'backHistory': {
inputWord.value = dictQueryStore.inputSearchWord;
break;
}
Expand Down
3 changes: 3 additions & 0 deletions internal/gomdict/mdict.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package gomdict
import (
"errors"
"fmt"
"github.com/terasum/medict/internal/utils"
"path/filepath"
"sort"
"strings"
Expand Down Expand Up @@ -118,6 +119,8 @@ func (mdict *Mdict) Search(word string) ([]*MDictKeyBlockEntry, error) {
// Distance reports how far apart the keywords of x and e are, as computed by
// levenshtein.Distance. Both keywords are first converted with
// utils.StrToUnicode, so the comparison runs on the escaped form of each
// keyword rather than on the raw text.
//
// e must hold a *MDictKeyBlockEntry; any other dynamic type makes the type
// assertion panic.
func (x *MDictKeyBlockEntry) Distance(e bktree.Entry) int {
	own := x.KeyWord
	other := e.(*MDictKeyBlockEntry).KeyWord

	return levenshtein.Distance(utils.StrToUnicode(own), utils.StrToUnicode(other))
}
Expand Down
18 changes: 13 additions & 5 deletions internal/gomdict/mdict_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,10 @@ import (
"encoding/binary"
"errors"
"fmt"
"github.com/rasky/go-lzo"
"os"
"strconv"
"strings"

"github.com/rasky/go-lzo"
)

// ReadDictHeader reads the dictionary header.
Expand Down Expand Up @@ -84,12 +83,13 @@ func (mdict *MdictBase) ReadDictHeader() error {

// Handle encoding
encoding := headerInfo.Encoding
encoding = strings.ToLower(encoding)
switch encoding {
case "GBK", "GB2312":
case "GBK", "GB2312", "gbk", "gb2312":
meta.Encoding = EncodingGb18030
case "Big5", "BIG5":
case "Big5", "BIG5", "big5":
meta.Encoding = EncodingBig5
case "utf16", "utf-16":
case "utf16", "utf-16", "UTF-16":
meta.Encoding = EncodingUtf16
default:
meta.Encoding = EncodingUtf8
Expand Down Expand Up @@ -624,6 +624,14 @@ func (mdict *MdictBase) splitKeyBlock(keyBlock []byte) []*MDictKeyBlockEntry {
keyTextBytes := keyBlock[keyStartIndex+mdict.Meta.NumberWidth : keyEndIndex]
keyText := string(keyTextBytes)
var err error

if mdict.Meta.Encoding == EncodingUtf16 {
keyText, err = decodeLittleEndianUtf16(keyTextBytes)
if err != nil {
keyText = string(keyTextBytes)
}
}

if mdict.FileType == MdictTypeMdd {
keyText, err = decodeLittleEndianUtf16(keyTextBytes)
if err != nil {
Expand Down
126 changes: 63 additions & 63 deletions internal/gomdict/mdict_base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,66 +159,66 @@ func TestMdictBase_ReadDictHeader3(t *testing.T) {

}

//
///*
//#include "mdict_extern.h"
//#include "mdict.h"
//#include <stdlib.h>
//*/
//import "C"
//
//import (
// "unsafe"
//)
//
//type Mdict struct {
// dict unsafe.Pointer
//}
//
//type SimpleKeyItem struct {
// keyWord string
//}
//
//func MdictInit(dictionaryPath string) *Mdict {
// dictFilePath := C.CString(dictionaryPath)
// defer C.free(unsafe.Pointer(dictFilePath))
// mydict := C.mdict_init(dictFilePath)
// return &Mdict{dict: mydict}
//}
//
//func (m *Mdict) Lookup(word string) string {
// queryWord := C.CString(word)
// defer C.free(unsafe.Pointer(queryWord))
// var result *C.char
// C.mdict_lookup(m.dict, queryWord, &result)
// defer C.free(unsafe.Pointer(result))
// return C.GoString(result)
//}
//
//func (m *Mdict) ParseDefinition(word string, recordStart uint64) string {
// queryWord := C.CString(word)
// defer C.free(unsafe.Pointer(queryWord))
// var result *C.char
// C.mdict_parse_definition(m.dict, queryWord, C.ulong(recordStart), &result)
// defer C.free(unsafe.Pointer(result))
// return C.GoString(result)
//}
//
//func (m *Mdict) KeyList() []*SimpleKeyItem {
// var len C.ulong
// keylist := C.mdict_keylist(m.dict, &len)
// defer C.free(unsafe.Pointer(keylist))
// items := make([]*SimpleKeyItem, len)
// for i := C.ulong(0); i < len; i++ {
// items[i] = &SimpleKeyItem{keyWord: C.GoString((*C.simple_key_item)(unsafe.Pointer(uintptr(unsafe.Pointer(keylist)) + uintptr(i)*unsafe.Sizeof(*keylist))).key_word)}
// }
// return items
//}
//
//func main() {
// // Example usage
// dictionaryPath := "path/to/dictionary"
// mydict := MdictInit(dictionaryPath)
// result := mydict.Lookup("word")
// println(result)
//}
// TestMdictBase_ReadDictFixBug1 loads the dictionary under testdata/bugdict,
// runs every parsing stage in order, and then resolves the record definition
// of a few key entries, logging the results.
//
// NOTE(review): judging by the name this is a regression test for a specific
// bug report; which property of this dictionary triggered the bug is not
// visible here.
func TestMdictBase_ReadDictFixBug1(t *testing.T) {
	mdictBase := &MdictBase{
		FilePath: "testdata/bugdict/教育部重編國語辭典(第五版)/教育部重編國語辭典(第五版).mdx",
	}

	// The stages are run in this fixed order; the first failure aborts the test.
	stages := []func() error{
		mdictBase.ReadDictHeader,
		mdictBase.ReadKeyBlockMeta,
		mdictBase.ReadKeyBlockInfo,
		mdictBase.ReadKeyEntries,
		mdictBase.ReadRecordBlockMeta,
		mdictBase.ReadRecordBlockInfo,
	}
	for _, stage := range stages {
		if err := stage(); err != nil {
			t.Fatal(err)
		}
	}

	entries := mdictBase.KeyBlockData.KeyEntries
	t.Logf("key entries list len: %d, record block info entry list len %d", len(entries), len(mdictBase.RecordBlockInfo.RecordInfoList))
	t.Logf("entries number size %d\n", mdictBase.KeyBlockData.KeyEntriesSize)

	// Entries 0, 1 and 3 are probed below; fail with a clear message instead
	// of an index-out-of-range panic when the key list is shorter than that.
	const minEntries = 4
	if len(entries) < minEntries {
		t.Fatalf("expected at least %d key entries, got %d", minEntries, len(entries))
	}
	t.Logf("keylist[0] %+v\n", entries[0])

	probes := []struct {
		label string // prefix used in the log line
		index int    // position in the key entry list
	}{
		{label: "0-0", index: 0},
		{label: "13-0", index: 1},
		{label: "13-7", index: 3},
	}
	for _, p := range probes {
		item := entries[p.index]

		data, err := mdictBase.LocateRecordDefinition(item)
		if err != nil {
			t.Fatal(err)
		}
		t.Logf("%s keyText: %s, data: %s", p.label, item.KeyWord, data)
	}
}
9 changes: 9 additions & 0 deletions internal/gomdict/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,17 @@ func littleEndianBinUTF16ToUTF8(bytes []byte, offset int, length int) string {
return string(u8)
}

// min returns the smaller of the two integers; when they are equal, that
// shared value is returned.
func min(a, b int) int {
	smaller := a
	if b < smaller {
		smaller = b
	}
	return smaller
}

// bigEndianBinToUTF8 returns up to length bytes of bytes, starting at offset,
// as a string. The bytes are converted as-is: no byte-order swap or
// transcoding is performed here.
//
// The requested window is clamped to the data that actually exists after
// offset, so a window that runs past the end yields only the available bytes
// (no padding). An empty string is returned when offset is negative or at or
// beyond the end of bytes, or when length is not positive.
func bigEndianBinToUTF8(bytes []byte, offset int, length int) string {
	if offset < 0 || length <= 0 || offset >= len(bytes) {
		return ""
	}

	// Compare against the remaining byte count instead of computing
	// offset+length first, so a huge length cannot overflow.
	end := len(bytes)
	if length < end-offset {
		end = offset + length
	}

	// The string conversion copies, so the result does not alias bytes.
	return string(bytes[offset:end])
}
Expand Down
33 changes: 33 additions & 0 deletions internal/utils/encoding_util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package utils

import (
"fmt"
"strconv"
"unicode"
)

// StrToUnicode rewrites str as a run of ASCII escape sequences, one per rune.
//
// Runes in the Han script are escaped with strconv.QuoteToASCII (with the
// surrounding quotes removed). Every other rune, plain ASCII included, is
// written as `\u` followed by its code point in lowercase hex, zero-padded to
// at least four digits. Bytes that are not valid UTF-8 decode to U+FFFD and
// are escaped as such.
//
// NOTE(review): a non-Han rune above U+FFFF comes out as `\u` plus five hex
// digits (e.g. `\u1f600`), which is not a standard escape; the output is
// therefore not meant to be parsed back.
func StrToUnicode(str string) string {
	// Each input byte expands to at most six output bytes, so this capacity
	// avoids regrowing the buffer while appending.
	out := make([]byte, 0, 6*len(str))

	// Ranging over the string decodes it rune by rune.
	for _, r := range str {
		if unicode.Is(unicode.Han, r) {
			quoted := strconv.QuoteToASCII(string(r))
			// Drop the leading and trailing double quote added by QuoteToASCII.
			out = append(out, quoted[1:len(quoted)-1]...)
			continue
		}
		out = append(out, fmt.Sprintf("\\u%04x", r)...)
	}
	return string(out)
}

// isFullFour left-pads str with '0' characters until it is four bytes long.
// Strings of one to three bytes are padded; the empty string and anything
// four bytes or longer are returned unchanged.
func isFullFour(str string) string {
	switch len(str) {
	case 1:
		return "000" + str
	case 2:
		return "00" + str
	case 3:
		return "0" + str
	default:
		return str
	}
}
12 changes: 12 additions & 0 deletions internal/utils/encoding_util_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package utils

import (
"testing"
)

// TestStrToUnicode exercises StrToUnicode on a string mixing Han characters
// with ASCII and full-width punctuation (logged for manual inspection), and
// checks the exact escaped form of a Han-only word.
func TestStrToUnicode(t *testing.T) {
	// Log the value as an argument rather than as the format string, so any
	// '%' in the output could never be misread as a formatting verb.
	t.Log(StrToUnicode("十大户¥@!#%……&……*()——+《》、,。、;‘、配【】"))

	const want = `\u570b\u8a9e\u8a5e\u5178`
	got := StrToUnicode("國語詞典")
	t.Log(got)
	if got != want {
		t.Errorf("StrToUnicode(%q) = %q, want %q", "國語詞典", got, want)
	}
}

0 comments on commit 70ab78e

Please sign in to comment.