From 70ab78ef30dafbcd7af73ca27f06111420211d0c Mon Sep 17 00:00:00 2001 From: terasum Date: Sat, 7 Oct 2023 17:52:00 +0800 Subject: [PATCH] fix: utf16 encoding error --- frontend/src/view/main/MainFunctions.vue | 8 ++ internal/gomdict/mdict.go | 3 + internal/gomdict/mdict_base.go | 18 +++- internal/gomdict/mdict_base_test.go | 126 +++++++++++------------ internal/gomdict/util.go | 9 ++ internal/utils/encoding_util.go | 33 ++++++ internal/utils/encoding_util_test.go | 12 +++ 7 files changed, 141 insertions(+), 68 deletions(-) create mode 100644 internal/utils/encoding_util.go create mode 100644 internal/utils/encoding_util_test.go diff --git a/frontend/src/view/main/MainFunctions.vue b/frontend/src/view/main/MainFunctions.vue index fc28a423..d025d84d 100644 --- a/frontend/src/view/main/MainFunctions.vue +++ b/frontend/src/view/main/MainFunctions.vue @@ -140,6 +140,14 @@ function listenInputWordUpdate() { after((result) => { switch (name) { case 'updateInputSearchWord': { + // inputWord.value = dictQueryStore.inputSearchWord; + break; + } + case 'forwardHistory': { + inputWord.value = dictQueryStore.inputSearchWord; + break; + } + case 'backHistory': { inputWord.value = dictQueryStore.inputSearchWord; break; } diff --git a/internal/gomdict/mdict.go b/internal/gomdict/mdict.go index 593e1c19..bce03c95 100644 --- a/internal/gomdict/mdict.go +++ b/internal/gomdict/mdict.go @@ -19,6 +19,7 @@ package gomdict import ( "errors" "fmt" + "github.com/terasum/medict/internal/utils" "path/filepath" "sort" "strings" @@ -118,6 +119,8 @@ func (mdict *Mdict) Search(word string) ([]*MDictKeyBlockEntry, error) { func (x *MDictKeyBlockEntry) Distance(e bktree.Entry) int { a := x.KeyWord b := e.(*MDictKeyBlockEntry).KeyWord + a = utils.StrToUnicode(a) + b = utils.StrToUnicode(b) return levenshtein.Distance(a, b) } diff --git a/internal/gomdict/mdict_base.go b/internal/gomdict/mdict_base.go index 02271b16..85536def 100644 --- a/internal/gomdict/mdict_base.go +++ b/internal/gomdict/mdict_base.go @@ -21,11 +21,10 @@ import ( "encoding/binary" "errors" "fmt" + "github.com/rasky/go-lzo" "os" "strconv" "strings" - - "github.com/rasky/go-lzo" ) // ReadDictHeader reads the dictionary header. @@ -84,12 +83,13 @@ func (mdict *MdictBase) ReadDictHeader() error { // Handle encoding encoding := headerInfo.Encoding + encoding = strings.ToLower(encoding) switch encoding { - case "GBK", "GB2312": + case "GBK", "GB2312", "gbk", "gb2312": meta.Encoding = EncodingGb18030 - case "Big5", "BIG5": + case "Big5", "BIG5", "big5": meta.Encoding = EncodingBig5 - case "utf16", "utf-16": + case "utf16", "utf-16", "UTF-16": meta.Encoding = EncodingUtf16 default: meta.Encoding = EncodingUtf8 @@ -624,6 +624,14 @@ func (mdict *MdictBase) splitKeyBlock(keyBlock []byte) []*MDictKeyBlockEntry { keyTextBytes := keyBlock[keyStartIndex+mdict.Meta.NumberWidth : keyEndIndex] keyText := string(keyTextBytes) var err error + + if mdict.Meta.Encoding == EncodingUtf16 { + keyText, err = decodeLittleEndianUtf16(keyTextBytes) + if err != nil { + keyText = string(keyTextBytes) + } + } + if mdict.FileType == MdictTypeMdd { keyText, err = decodeLittleEndianUtf16(keyTextBytes) if err != nil { diff --git a/internal/gomdict/mdict_base_test.go b/internal/gomdict/mdict_base_test.go index 8795d7d8..6197a9fe 100644 --- a/internal/gomdict/mdict_base_test.go +++ b/internal/gomdict/mdict_base_test.go @@ -159,66 +159,66 @@ func TestMdictBase_ReadDictHeader3(t *testing.T) { } -// -///* -//#include "mdict_extern.h" -//#include "mdict.h" -//#include -//*/ -//import "C" -// -//import ( -// "unsafe" -//) -// -//type Mdict struct { -// dict unsafe.Pointer -//} -// -//type SimpleKeyItem struct { -// keyWord string -//} -// -//func MdictInit(dictionaryPath string) *Mdict { -// dictFilePath := C.CString(dictionaryPath) -// defer C.free(unsafe.Pointer(dictFilePath)) -// mydict := C.mdict_init(dictFilePath) -// return &Mdict{dict: mydict} -//} -// -//func (m *Mdict) Lookup(word string) string { -// queryWord := C.CString(word) -// defer C.free(unsafe.Pointer(queryWord)) -// var result *C.char -// C.mdict_lookup(m.dict, queryWord, &result) -// defer C.free(unsafe.Pointer(result)) -// return C.GoString(result) -//} -// -//func (m *Mdict) ParseDefinition(word string, recordStart uint64) string { -// queryWord := C.CString(word) -// defer C.free(unsafe.Pointer(queryWord)) -// var result *C.char -// C.mdict_parse_definition(m.dict, queryWord, C.ulong(recordStart), &result) -// defer C.free(unsafe.Pointer(result)) -// return C.GoString(result) -//} -// -//func (m *Mdict) KeyList() []*SimpleKeyItem { -// var len C.ulong -// keylist := C.mdict_keylist(m.dict, &len) -// defer C.free(unsafe.Pointer(keylist)) -// items := make([]*SimpleKeyItem, len) -// for i := C.ulong(0); i < len; i++ { -// items[i] = &SimpleKeyItem{keyWord: C.GoString((*C.simple_key_item)(unsafe.Pointer(uintptr(unsafe.Pointer(keylist)) + uintptr(i)*unsafe.Sizeof(*keylist))).key_word)} -// } -// return items -//} -// -//func main() { -// // Example usage -// dictionaryPath := "path/to/dictionary" -// mydict := MdictInit(dictionaryPath) -// result := mydict.Lookup("word") -// println(result) -//} +func TestMdictBase_ReadDictFixBug1(t *testing.T) { + mdictBase := &MdictBase{ + FilePath: "testdata/bugdict/教育部重編國語辭典(第五版)/教育部重編國語辭典(第五版).mdx", + } + err := mdictBase.ReadDictHeader() + if err != nil { + t.Fatal(err) + } + + err = mdictBase.ReadKeyBlockMeta() + if err != nil { + t.Fatal(err) + } + + err = mdictBase.ReadKeyBlockInfo() + if err != nil { + t.Fatal(err) + } + + err = mdictBase.ReadKeyEntries() + if err != nil { + t.Fatal(err) + } + + err = mdictBase.ReadRecordBlockMeta() + if err != nil { + t.Fatal(err) + } + + err = mdictBase.ReadRecordBlockInfo() + if err != nil { + t.Fatal(err) + } + + t.Logf("key entries list len: %d, record block info entry list len %d", len(mdictBase.KeyBlockData.KeyEntries), len(mdictBase.RecordBlockInfo.RecordInfoList)) + t.Logf("entries number size %d\n", mdictBase.KeyBlockData.KeyEntriesSize) + t.Logf("keylist[0] %+v\n", mdictBase.KeyBlockData.KeyEntries[0]) + + item := mdictBase.KeyBlockData.KeyEntries[0] + + data, err := mdictBase.LocateRecordDefinition(item) + if err != nil { + t.Fatal(err) + } + t.Logf("0-0 keyText: %s, data: %s", item.KeyWord, data) + + item = mdictBase.KeyBlockData.KeyEntries[1] + + data, err = mdictBase.LocateRecordDefinition(item) + if err != nil { + t.Fatal(err) + } + t.Logf("13-0 keyText: %s, data: %s", item.KeyWord, data) + + item = mdictBase.KeyBlockData.KeyEntries[3] + + data, err = mdictBase.LocateRecordDefinition(item) + if err != nil { + t.Fatal(err) + } + t.Logf("13-7 keyText: %s, data: %s", item.KeyWord, data) + +} diff --git a/internal/gomdict/util.go b/internal/gomdict/util.go index 3a4a3364..8b8943ee 100644 --- a/internal/gomdict/util.go +++ b/internal/gomdict/util.go @@ -48,8 +48,17 @@ func littleEndianBinUTF16ToUTF8(bytes []byte, offset int, length int) string { return string(u8) } +func min(a, b int) int { + if a > b { + return b + } + return a +} + func bigEndianBinToUTF8(bytes []byte, offset int, length int) string { cbytes := make([]byte, length) + rawLen := len(bytes) + length = min(rawLen, length) copy(cbytes, bytes[offset:offset+length]) return string(cbytes) } diff --git a/internal/utils/encoding_util.go b/internal/utils/encoding_util.go new file mode 100644 index 00000000..159786e6 --- /dev/null +++ b/internal/utils/encoding_util.go @@ -0,0 +1,33 @@ +package utils + +import ( + "fmt" + "strconv" + "unicode" +) + +func StrToUnicode(str string) string { + DD := []rune(str) //需要分割的字符串内容,将它转为字符,然后取长度。 + finallStr := "" + for i := 0; i < len(DD); i++ { + if unicode.Is(unicode.Scripts["Han"], DD[i]) { + textQuoted := strconv.QuoteToASCII(string(DD[i])) + finallStr += textQuoted[1 : len(textQuoted)-1] + } else { + h := fmt.Sprintf("%x", DD[i]) + finallStr += "\\u" + isFullFour(h) + } + } + return finallStr +} + +func isFullFour(str string) string { + if len(str) == 1 { + str = "000" + str + } else if len(str) == 2 { + str = "00" + str + } else if len(str) == 3 { + str = "0" + str + } + return str +} diff --git a/internal/utils/encoding_util_test.go b/internal/utils/encoding_util_test.go new file mode 100644 index 00000000..dc39460f --- /dev/null +++ b/internal/utils/encoding_util_test.go @@ -0,0 +1,12 @@ +package utils + +import ( + "testing" +) + +func TestStrToUnicode(t *testing.T) { + uncodeStr := StrToUnicode("十大户¥@!#%……&……*()——+《》、,。、;‘、配【】") + t.Logf(uncodeStr) + uncodeStr = StrToUnicode("國語詞典") + t.Logf(uncodeStr) +}