From da0a1a042d73f131e6131523787f163387be66cb Mon Sep 17 00:00:00 2001 From: JoungKyun Kim Date: Thu, 1 Aug 2019 03:11:56 +0900 Subject: [PATCH] fixed #13 Report utf-8-sig --- README.md | 23 +++++++++++- man/en/detect.3 | 10 ++--- man/en/detect_destroy.3 | 10 ++--- man/en/detect_handledata.3 | 10 ++--- man/en/detect_init.3 | 10 ++--- man/en/detect_obj_free.3 | 10 ++--- man/en/detect_obj_init.3 | 11 +++--- man/en/detect_reset.3 | 10 ++--- man/ko/detect.3 | 11 +++--- man/ko/detect_destroy.3 | 10 ++--- man/ko/detect_handledata.3 | 10 ++--- man/ko/detect_init.3 | 12 +++--- man/ko/detect_obj_free.3 | 12 +++--- man/ko/detect_obj_init.3 | 13 ++++--- man/ko/detect_reset.3 | 12 +++--- src/chardet.cpp | 8 ++-- src/chardet.h | 8 ++-- src/nsUniversalDetector.cpp | 73 ++++++++++++++++++++++++++++++++++++- src/nsUniversalDetector.h | 4 +- test/Makefile | 15 +++++--- test/bom-test.c | 65 +++++++++++++++++++++++++++++++++ test/sample.c | 2 +- test/sample1.c | 2 +- test/utf-8-bom.txt | 1 + test/utf-8.txt | 1 + 25 files changed, 261 insertions(+), 92 deletions(-) create mode 100644 test/bom-test.c create mode 100644 test/utf-8-bom.txt create mode 100644 test/utf-8.txt diff --git a/README.md b/README.md index 3659ffb..0734ae1 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,11 @@ From 1.0.5, libchardet was reflected single-byte charset detection confidence algorithm of [uchardet](https://github.com/BYVoid/uchardet/) and new language models. (Arabic, Danish, Esperanto, German, Spanish, Turkish, Vietnamese) +From 1.0.6, a bom member has been added to the DetectObj structure. +When the value of the bom member is 1, a BOM was detected in the input. +Support for the bom member can be determined by the existence of the CHARDET_BOM_CHECK +constant. See example below. 
+ ## Installation See also [INSTALL](INSTALL) document @@ -63,10 +68,18 @@ See also test directory of source code return CHARDET_NULL_OBJECT; } + #ifndef CHARDET_BOM_CHECK printf ("encoding: %s, confidence: %f\n", obj->encoding, obj->confidence); + #else + // from 1.0.6 support return whether exists BOM + printf ( + "encoding: %s, confidence: %f, exist BOM: %d\n", + obj->encoding, obj->confidence, obj->bom + ); + #endif detect_obj_free (&obj); - return 0; + return 0; } ``` @@ -112,7 +125,15 @@ or looping code return CHARDET_NULL_OBJECT; } + #ifndef CHARDET_BOM_CHECK printf ("encoding: %s, confidence: %f\n", obj->encoding, obj->confidence); + #else + // from 1.0.6 support return whether exists BOM + printf ( + "encoding: %s, confidence: %f, exist BOM: %d\n", + obj->encoding, obj->confidence, obj->bom + ); + #endif detect_obj_free (&obj); if ( 1 ) diff --git a/man/en/detect.3 b/man/en/detect.3 index 55d2261..2ec6fbe 100644 --- a/man/en/detect.3 +++ b/man/en/detect.3 @@ -1,8 +1,7 @@ -.TH detect 3 2015-12-11 "libchardet manuals" +.TH detect 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect.3 -.\" 2016-05-05 JoungKyun.Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun.Kim .SH NAME detect, detect_r \- Detecting character set and measuring accuracy of charset @@ -113,7 +112,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # check support obj->bom with CHARDET_BOM_CHECK constant + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); return 0; @@ -124,7 +124,7 @@ int main (void) { JoungKyun.Kim .SH "BUG REPORTS" -Use QnA board on http://oops.org +Use QnA board on https://github.com/Joungkyun/libchardet/issues .SH "SEE ALSO" detect_handledata(3), detect_obj_init(3), detect_obj_free(3) diff --git a/man/en/detect_destroy.3 b/man/en/detect_destroy.3 index 7a87dea..23c6657 100644 --- a/man/en/detect_destroy.3 +++ 
b/man/en/detect_destroy.3 @@ -1,8 +1,7 @@ -.TH detect_destroy 3 2015-12-11 "libchardet manuals" +.TH detect_destroy 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_destroy.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH NAME detect_destroy \- free Detector structure @@ -52,7 +51,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # check support obj->bom with CHARDET_BOM_CHECK constant + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); detect_destroy (&d); @@ -64,7 +64,7 @@ int main (void) { JoungKyun.Kim .SH "BUG REPORTS" -Use QnA board on http://oops.org +Use QnA board on https://github.com/Joungkyun/libchardet/issues .SH "SEE ALSO" detect_init(3), detect_reset(3) diff --git a/man/en/detect_handledata.3 b/man/en/detect_handledata.3 index 2f3d3f4..483ccbe 100644 --- a/man/en/detect_handledata.3 +++ b/man/en/detect_handledata.3 @@ -1,8 +1,7 @@ -.TH detect_handledata 3 2015-12-11 "libchardet manuals" +.TH detect_handledata 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_handledata.3 -.\" 2016-05-05 JoungKyun.Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun.Kim .SH NAME detect_handledata, detect_handledata_r \- Detecting character set and measuring accuracy of charset @@ -136,7 +135,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # check support obj->bom with CHARDET_BOM_CHECK constant + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); } detect_destroy (&d); @@ -149,7 +149,7 @@ int main (void) { JoungKyun.Kim .SH "BUG REPORTS" -Use QnA board on http://oops.org +Use QnA board on https://github.com/Joungkyun/libchardet/issues .SH "SEE ALSO" detect_obj_init(3), detect_obj_free(3), 
detect_init(3), detect_reset(3), detect_destroy(3) diff --git a/man/en/detect_init.3 b/man/en/detect_init.3 index c4f7f23..fe40bae 100644 --- a/man/en/detect_init.3 +++ b/man/en/detect_init.3 @@ -1,8 +1,7 @@ -.TH detect_init 3 2015-12-11 "libchardet manuals" +.TH detect_init 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_init.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH NAME detect_init - initialize Detect structure @@ -62,7 +61,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # check support obj->bom with CHARDET_BOM_CHECK constant + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); detect_destroy (&d); @@ -74,7 +74,7 @@ int main (void) { JoungKyun.Kim .SH "BUG REPORTS" -Use QnA board on http://oops.org +Use QnA board on https://github.com/Joungkyun/libchardet/issues .SH "SEE ALSO" detect_obj_init(3), detect_obj_free(3), detect_reset(3), detect_handledata(3), detect_destroy(3) diff --git a/man/en/detect_obj_free.3 b/man/en/detect_obj_free.3 index 6e31081..017c56a 100644 --- a/man/en/detect_obj_free.3 +++ b/man/en/detect_obj_free.3 @@ -1,8 +1,7 @@ -.TH detect_obj_free 3 2015-12-11 "libchardet manuals" +.TH detect_obj_free 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_obj_free.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH NAME chardet_obj_free - free DetectObject structure @@ -45,7 +44,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # check support obj->bom with CHARDET_BOM_CHECK constant + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); return 0; @@ -56,7 +56,7 @@ int main (void) { JoungKyun.Kim .SH "BUG REPORTS" -Use QnA board on 
http://oops.org +Use QnA board on https://github.com/Joungkyun/libchardet/issues .SH "SEE ALSO" detect_obj_init(3) diff --git a/man/en/detect_obj_init.3 b/man/en/detect_obj_init.3 index c63cfd2..d081488 100644 --- a/man/en/detect_obj_init.3 +++ b/man/en/detect_obj_init.3 @@ -1,8 +1,7 @@ -.TH detect_obj_init 3 2015-12-11 "libchardet manuals" +.TH detect_obj_init 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_obj_init.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH NAME detect_obj_init - initialize DetectObject structure @@ -28,6 +27,7 @@ api. typedef struct DetectObject { char * encoding; float confidence; + short bom; } DetectObj; .PP @@ -55,7 +55,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # check support obj->bom with CHARDET_BOM_CHECK constant + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); return 0; @@ -66,7 +67,7 @@ int main (void) { JoungKyun.Kim .SH "BUG REPORTS" -Use QnA board on http://oops.org +Use QnA board on https://github.com/Joungkyun/libchardet/issues .SH "SEE ALSO" detect_obj_free(3), detect(3), detect_handledata(3) diff --git a/man/en/detect_reset.3 b/man/en/detect_reset.3 index d11ceb3..cf639f9 100644 --- a/man/en/detect_reset.3 +++ b/man/en/detect_reset.3 @@ -1,8 +1,7 @@ -.TH detect_reset 3 2015-12-11 "libchardet manuals" +.TH detect_reset 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_reset.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH NAME detect_reset - reset Detect structure @@ -59,7 +58,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # check support obj->bom with CHARDET_BOM_CHECK constant + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, 
obj->bom); detect_obj_free (&obj); } detect_destroy (&d); @@ -72,7 +72,7 @@ int main (void) { JoungKyun.Kim .SH "BUG REPORTS" -Use QnA board on http://oops.org +Use QnA board on https://github.com/Joungkyun/libchardet/issues .SH "SEE ALSO" detect_obj_init(3), detect_obj_free(3), detect_init(3), detect_handledata(3), detect_destroy(3) diff --git a/man/ko/detect.3 b/man/ko/detect.3 index 2834fcc..08f9cbb 100644 --- a/man/ko/detect.3 +++ b/man/ko/detect.3 @@ -1,8 +1,7 @@ -.TH detect 3 2016-05-05 "libchardet manuals" +.TH detect 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect.3 -.\" 2016-05-05 JoungKyun.Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun.Kim .SH 이름 detect, detect_r \- 문자열의 문자셋과 정확도를 측정 @@ -46,6 +45,7 @@ API를 이용하십시오. typedef struct DetectObject { char * encoding; float confidence; + short bom; } DetectObj; .fi @@ -109,7 +109,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); return 0; @@ -120,7 +121,7 @@ int main (void) { 김정균 .SH 버그 리포트 - 의 QnA 게시판을 이용한다. +https://github.com/Joungkyun/libchardet/issues .SH "참고" detect_handledata(3), detect_obj_init(3), detect_obj_free(3) diff --git a/man/ko/detect_destroy.3 b/man/ko/detect_destroy.3 index 259472b..6343c05 100644 --- a/man/ko/detect_destroy.3 +++ b/man/ko/detect_destroy.3 @@ -1,8 +1,7 @@ -.TH detect_destroy 3 2015-12-11 "libchardet manuals" +.TH detect_destroy 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_destroy.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH 이름 detect_destroy \- chardet resource를 해제한다. 
@@ -51,7 +50,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); detect_destroy (&d); @@ -63,7 +63,7 @@ int main (void) { 김정균 .SH 버그 리포트 - 의 QnA 게시판을 이용한다. +https://github.com/Joungkyun/libchardet/issues .SH "참고" detect_init(3), detect_reset(3) diff --git a/man/ko/detect_handledata.3 b/man/ko/detect_handledata.3 index 2fa17ff..47d61f6 100644 --- a/man/ko/detect_handledata.3 +++ b/man/ko/detect_handledata.3 @@ -1,8 +1,7 @@ -.TH detect_handledata 3 2016-05-05 "libchardet manuals" +.TH detect_handledata 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_handledata.3 -.\" 2016-05-05 JoungKyun.Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun.Kim .SH 이름 detect_handledata, detect_handledata_r \- 문자셋과 정확도를 측정 @@ -131,7 +130,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); } detect_destroy (&d); @@ -144,7 +144,7 @@ int main (void) { 김정균 .SH 버그 리포트 - 의 QnA 게시판을 이용한다. 
+https://github.com/Joungkyun/libchardet/issues .SH "참고" detect_obj_init(3), detect_obj_free(3), detect_init(3), detect_reset(3), detect_destroy(3) diff --git a/man/ko/detect_init.3 b/man/ko/detect_init.3 index e27d447..98cc347 100644 --- a/man/ko/detect_init.3 +++ b/man/ko/detect_init.3 @@ -1,8 +1,7 @@ -.TH detect_init 3 2015-12-11 "libchardet manuals" +.TH detect_init 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_init.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH 이름 detect_init - chardet file handle 초기화 @@ -52,7 +51,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); detect_destroy (&d); @@ -62,9 +62,9 @@ int main (void) { .SH 저자 김정균 .SH 버그 리포트 - 의 QnA 게시판을 이용한다. +https://github.com/Joungkyun/libchardet/issues .SH 저작권 -Copyright (c) 2017 JoungKyun.Kim +Copyright (c) 2019 JoungKyun.Kim 이 프로그램은 MPL/GPL2/LGPL2.1 을 따르며, 사용시의 어떠한 문제에 대하여 보증하지 않는다. .SH "참고" diff --git a/man/ko/detect_obj_free.3 b/man/ko/detect_obj_free.3 index 90ad6f4..ef1281c 100644 --- a/man/ko/detect_obj_free.3 +++ b/man/ko/detect_obj_free.3 @@ -1,8 +1,7 @@ -.TH detect_obj_free 3 2015-12-11 "libchardet manuals" +.TH detect_obj_free 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_obj_free.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH 이름 chardet_obj_free - chardet_obj_init 의 return value memory 해제 @@ -40,7 +39,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. 
+ printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); return 0; @@ -49,9 +49,9 @@ int main (void) { .SH 저자 김정균 .SH 버그 리포트 - 의 QnA 게시판을 이용한다. +https://github.com/Joungkyun/libchardet/issues .SH 저작권 -Copyright (c) 2017 JoungKyun.Kim +Copyright (c) 2019 JoungKyun.Kim 이 프로그램은 MPL/GPL2/LGPL2.1 을 따르며, 사용시의 어떠한 문제에 대하여 보증하지 않는다. .SH "참고" diff --git a/man/ko/detect_obj_init.3 b/man/ko/detect_obj_init.3 index 8b3fb84..04ca738 100644 --- a/man/ko/detect_obj_init.3 +++ b/man/ko/detect_obj_init.3 @@ -1,8 +1,7 @@ -.TH detect_obj_init 3 2015-12-11 "libchardet manuals" +.TH detect_obj_init 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_obj_init.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH 이름 detect_obj_init - libchardet 의 결과 값 structure 초기화 @@ -20,6 +19,7 @@ DetectObject struct 를 반환한다. 반환된 값은 chardet_obj_free API 를 typedef struct DetectObject { char * encoding; float confidence; + short bom; } DetectObj; .PP .SH 예제 @@ -46,7 +46,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); return 0; @@ -55,9 +56,9 @@ int main (void) { .SH 저자 김정균 .SH 버그 리포트 - 의 QnA 게시판을 이용한다. +https://github.com/Joungkyun/libchardet/issues .SH 저작권 -Copyright (c) 2017 JoungKyun.Kim +Copyright (c) 2019 JoungKyun.Kim 이 프로그램은 MPL/GPL2/LGPL2.1 을 따르며, 사용시의 어떠한 문제에 대하여 보증하지 않는다. 
.SH "참고" diff --git a/man/ko/detect_reset.3 b/man/ko/detect_reset.3 index ef08cbc..0e8410d 100644 --- a/man/ko/detect_reset.3 +++ b/man/ko/detect_reset.3 @@ -1,8 +1,7 @@ -.TH detect_reset 3 2015-12-11 "libchardet manuals" +.TH detect_reset 3 2019-08-01 "libchardet manuals" .\" Process with .\" nroff -man detect_reset.3 -.\" 2015-12-11 JoungKyun Kim -.\" $Id$ +.\" 2019-08-01 JoungKyun Kim .SH 이름 detect_reset - chardet file handle reset @@ -55,7 +54,8 @@ int main (void) { return CHARDET_NULL_OBJECT; } - printf ("encoding: %s, confidence: %f\\n", obj->encoding, obj->confidence); + # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. + printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); } detect_destroy (&d); @@ -66,9 +66,9 @@ int main (void) { .SH 저자 김정균 .SH 버그 리포트 - 의 QnA 게시판을 이용한다. +https://github.com/Joungkyun/libchardet/issues .SH 저작권 -Copyright (c) 2017 JoungKyun.Kim +Copyright (c) 2019 JoungKyun.Kim 이 프로그램은 MPL/GPL2/LGPL2.1 을 따르며, 사용시의 어떠한 문제에 대하여 보증하지 않는다. 
.SH "참고" diff --git a/src/chardet.cpp b/src/chardet.cpp index c2e3078..58650b7 100644 --- a/src/chardet.cpp +++ b/src/chardet.cpp @@ -16,8 +16,6 @@ * Detect class by John Gardiner Myers * C wrapping API by JoungKyun.Kim * - * $Id$ - * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), @@ -43,6 +41,7 @@ class Detector: public nsUniversalDetector { virtual ~Detector () {} const char *getCharsetName () { return mDetectedCharset; } float getConfidence () { return mDetectedConfidence; } + short getIsBOM () { return mDetectedIsBOM; } virtual void Reset () { this->nsUniversalDetector::Reset (); } protected: virtual void Report (const char* aCharset) { mDetectedCharset = aCharset; } @@ -70,6 +69,7 @@ CHARDET_API DetectObj * detect_obj_init (void) { obj->encoding = NULL; obj->confidence = 0.0; + obj->bom = 0; return obj; } @@ -121,6 +121,7 @@ CHARDET_API short detect_handledata_r (Detect ** det, const char * buf, size_t b (*obj)->encoding = (char *) strdup (ret); (*obj)->confidence = (*det)->detect->getConfidence (); + (*obj)->bom = (*det)->detect->getIsBOM (); return CHARDET_SUCCESS; } @@ -156,6 +157,7 @@ CHARDET_API short detect_r (const char *buf, size_t buflen, DetectObj ** obj) { (*obj)->encoding = (char *) strdup (ret); (*obj)->confidence = det->getConfidence (); + (*obj)->bom = det->getIsBOM (); return CHARDET_SUCCESS; } @@ -165,6 +167,6 @@ CHARDET_API short detect_r (const char *buf, size_t buflen, DetectObj ** obj) { * tab-width: 4 * c-basic-offset: 4 * End: - * vim600: noet sw=4 ts=4 fdm=marker + * vim: noet sw=4 ts=4 fdm=marker * vim<600: noet sw=4 ts=4 */ diff --git a/src/chardet.h b/src/chardet.h index 84975a3..ff923aa 100644 --- a/src/chardet.h +++ b/src/chardet.h @@ -16,8 +16,6 @@ * Detect class by John Gardiner Myers * C wrapping API by JoungKyun.Kim * - * $Id$ - * * Alternatively, the 
contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), @@ -81,6 +79,9 @@ // whether to support detect_r and detect_handledata_r API #define CHARDET_BINARY_SAFE 1 +// whether to support bom member of DetectObj structure +#define CHARDET_BOM_CHECK 1 + #ifdef __cplusplus extern "C" { #endif @@ -89,6 +90,7 @@ extern "C" { typedef struct DetectObject { char * encoding; float confidence; + short bom; } DetectObj; CHARDET_API char * detect_version (void); @@ -116,6 +118,6 @@ extern "C" { * tab-width: 4 * c-basic-offset: 4 * End: - * vim600: noet sw=4 ts=4 fdm=marker + * vim: noet sw=4 ts=4 fdm=marker * vim<600: noet sw=4 ts=4 */ diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index f0dded8..7080149 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -1,5 +1,5 @@ /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- - * vim600: et sw=2 ts=2 fdm=marker + * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 @@ -25,8 +25,8 @@ * Shy Shalom * JoungKyun.Kim * - Add mDetectedConfidence + * - Add mDetectedIsBOM * - * $Id$ * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or @@ -62,6 +62,7 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) mStart = PR_TRUE; mDetectedCharset = nsnull; mDetectedConfidence = 0.0; + mDetectedIsBOM = 0; mGotData = PR_FALSE; mInputState = ePureAscii; mLastChar = '\0'; @@ -91,6 +92,7 @@ nsUniversalDetector::Reset() mStart = PR_TRUE; mDetectedCharset = nsnull; mDetectedConfidence = 0.0; + mDetectedIsBOM = 0; mGotData = PR_FALSE; mInputState = ePureAscii; mLastChar = '\0'; @@ -128,6 +130,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) // EF BB 
BF UTF-8 encoded BOM mDetectedCharset = "UTF-8"; mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; } break; case '\xFE': @@ -135,10 +138,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) // FE FF 00 00 UCS-4, unusual octet order BOM (3412) mDetectedCharset = "X-ISO-10646-UCS-4-3412"; mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; } else if ('\xFF' == aBuf[1]) { // FE FF UTF-16, big endian BOM mDetectedCharset = "UTF-16BE"; mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; } break; case '\x00': @@ -146,10 +151,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) // 00 00 FE FF UTF-32, big-endian BOM mDetectedCharset = "UTF-32BE"; mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; } else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) { // 00 00 FF FE UCS-4, unusual octet order BOM (2143) mDetectedCharset = "X-ISO-10646-UCS-4-2143"; mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; } break; case '\xFF': @@ -157,12 +164,73 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) // FF FE 00 00 UTF-32, little-endian BOM mDetectedCharset = "UTF-32LE"; mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; } else if ('\xFE' == aBuf[1]) { // FF FE UTF-16, little endian BOM mDetectedCharset = "UTF-16LE"; mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; } break; + case '\x2B': + if (('\x2F' == aBuf[1]) && ('\x76' == aBuf[2])) { + switch (aBuf[3]) { + case '\x38': + case '\x39': + case '\x2B': + case '\x2F': + // https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding + // 2B 2F 76 38 UTF-7 + // 2B 2F 76 39 UTF-7 + // 2B 2F 76 2B UTF-7 + // 2B 2F 76 2F UTF-7 + mDetectedCharset = "UTF-7"; + mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; + break; + } + } + break; + case '\xE7': + if (('\x64' == aBuf[1]) && ('\x4C' == aBuf[2])) { + // E7 64 4c UTF-1 encoded BOM + mDetectedCharset = "UTF-1"; + mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; + } + break; + case '\xDD': 
+ if (('\x73' == aBuf[1]) && ('\x66' == aBuf[2]) && ('\x73' == aBuf[3])) { + // DD 73 66 73 UTF-EBCDIC encoded BOM + mDetectedCharset = "UTF-EBCDIC"; + mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; + } + break; + case '\x0E': + if (('\xFE' == aBuf[1]) && ('\xFF' == aBuf[2])) { + // 0E FE FF SCSU encoded BOM + mDetectedCharset = "SCSU"; + mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; + } + break; + case '\xFB': + if (('\xEE' == aBuf[1]) && ('\x28' == aBuf[2])) { + // FB EE 28 BOCU-1 encoded BOM + mDetectedCharset = "BOCU-1"; + mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; + } + break; + case '\x84': + if (('\x31' == aBuf[1]) && ('\x95' == aBuf[2]) && ('\x33' == aBuf[3])) { + // 84 31 95 33 GB18030 encoded BOM + mDetectedCharset = "GB18030"; + mDetectedConfidence = 1.0; + mDetectedIsBOM = 1; + } + break; } // switch if (mDetectedCharset) @@ -272,6 +340,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) mDone = PR_TRUE; mDetectedCharset = mNbspFound ? "ISO-8859-1" : "ASCII"; mDetectedConfidence = 1.0; + mDetectedIsBOM = 0; } return NS_OK; } diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h index df03673..17c556f 100644 --- a/src/nsUniversalDetector.h +++ b/src/nsUniversalDetector.h @@ -1,5 +1,5 @@ /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- - * vim600: et sw=2 ts=2 fdm=marker + * vim: et sw=2 ts=2 fdm=marker */ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 @@ -24,6 +24,7 @@ * Contributor(s): * JoungKyun.Kim * - Add mDetectedConfidence + * - Add mDetectedIsBOM * * $Id$ * @@ -86,6 +87,7 @@ class nsUniversalDetector { char mLastChar; const char * mDetectedCharset; float mDetectedConfidence; + short mDetectedIsBOM; PRInt32 mBestGuess; PRUint32 mLanguageFilter; diff --git a/test/Makefile b/test/Makefile index b7a5d5d..a182a27 100644 --- a/test/Makefile +++ b/test/Makefile @@ -9,13 +9,16 @@ CC = gcc CFLAGS = -I../src -I../include LIBS = -L../src/.libs 
-Wl,-rpath=../src/.libs -lchardet -lstdc++ -all: sample sample1 +all: sample sample1 bom-test -sample: - $(CC) $(DEFS) $(CFLAGS) sample.c $(LIBS) -o sample +sample: sample.c + $(CC) $(DEFS) $(CFLAGS) $< $(LIBS) -o $@ -sample1: - $(CC) $(DEFS) $(CFLAGS) sample1.c $(LIBS) -o sample1 +sample1: sample1.c + $(CC) $(DEFS) $(CFLAGS) $< $(LIBS) -o $@ + +bom-test: bom-test.c + $(CC) $(DEFS) $(CFLAGS) $< $(LIBS) -o $@ clean : - -rm -f sample sample1 + -rm -f sample sample1 bom-test diff --git a/test/bom-test.c b/test/bom-test.c new file mode 100644 index 0000000..7c53ab7 --- /dev/null +++ b/test/bom-test.c @@ -0,0 +1,65 @@ +/* + * BOM test for libchardet: runs the detector on utf-8-bom.txt and utf-8.txt and prints charset, confidence, BOM flag and the text + * author: JoungKyun.Kim + * $Id$ + */ +#include +//#include "../src/chardet.h" +#include +#include + +#ifdef CHARDET_BINARY_SAFE + #define detect_api(x,y) detect_r(x, strlen(x), y) +#else + #define detect_api(x,y) detect(x, y) +#endif + +int main (void) { + DetectObj *obj; + FILE *fp; + struct stat sb; + char *buf; + char *buf1; + size_t n, i; + char *f[2] = { "utf-8-bom.txt", "utf-8.txt" }; + + for ( i=0; i<2; i++ ) { + stat (f[i], &sb); + + buf = malloc (sizeof (char) * (sb.st_size + 4)); + memset (buf, 0, sb.st_size + 4); + + + fp = fopen (f[i], "r"); + n = fread (buf, sizeof (char), sb.st_size, fp); + fclose (fp); + + obj = detect_obj_init (); + if ( obj == NULL ) { + fprintf (stderr, "On attemped detector, memory allocation failed\n"); + return CHARDET_OUT_OF_MEMORY; + } + + if ( detect_api (buf, &obj) == CHARDET_OUT_OF_MEMORY ) + { + fprintf (stderr, "On handle processing, occured out of memory\n"); + return CHARDET_OUT_OF_MEMORY; + } + + if ( n > 0 && buf[n-1] == '\n' ) { /* n is 0 when fread read nothing; do not index buf[-1] */ + memset (buf + n - 1, 0, 1); + n--; + } + + if ( obj->bom == 1 && n >= 3 ) { /* both inputs are UTF-8: the UTF-8 BOM (EF BB BF) is 3 bytes, not 4 */ + memmove (buf, buf + 3, n - 3); + memset (buf + n - 3, 0, 1); + } + + printf ("## Charset: %s, Confidence: %f, BOM: %d => %s\n", obj->encoding, obj->confidence, obj->bom, buf); + + free (buf); + detect_obj_free (&obj); + } + return 0; +} diff --git a/test/sample.c b/test/sample.c index
82fa43e..c325577 100644 --- a/test/sample.c +++ b/test/sample.c @@ -48,7 +48,7 @@ int main (void) { fprintf (stderr, "On handle processing, occured out of memory\n"); return CHARDET_OUT_OF_MEMORY; } - printf ("## %s : %s : %f\n", str[i], obj->encoding, obj->confidence); + printf ("## %s : %s : %f : %d\n", str[i], obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); } diff --git a/test/sample1.c b/test/sample1.c index 301b115..ab5b8e3 100644 --- a/test/sample1.c +++ b/test/sample1.c @@ -39,7 +39,7 @@ int main (void) { fprintf (stderr, "On handle processing, occured out of memory\n"); return CHARDET_OUT_OF_MEMORY; } - printf ("## %s : %s : %f\n", str[i], obj->encoding, obj->confidence); + printf ("## %s : %s : %f : %d\n", str[i], obj->encoding, obj->confidence, obj->bom); detect_obj_free (&obj); } diff --git a/test/utf-8-bom.txt b/test/utf-8-bom.txt new file mode 100644 index 0000000..cf814b9 --- /dev/null +++ b/test/utf-8-bom.txt @@ -0,0 +1 @@ +utf-8 bom 테스트 입니다. bom 을 잘 발견할 수 있을까요? diff --git a/test/utf-8.txt b/test/utf-8.txt new file mode 100644 index 0000000..3ff1657 --- /dev/null +++ b/test/utf-8.txt @@ -0,0 +1 @@ +utf-8 bom 테스트 입니다. bom 이 없는데 구분을 잘 할까요?