Skip to content

Commit

Permalink
add uchardet_get_confidence func
Browse files Browse the repository at this point in the history
  • Loading branch information
PyYoshi committed Mar 27, 2017
1 parent b7707d4 commit f1e11d6
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 16 deletions.
10 changes: 5 additions & 5 deletions src/nsSBCSGroupProber.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,19 @@ nsSBCSGroupProber::nsSBCSGroupProber()

mProbers[46] = new nsSingleByteCharSetProber(&Windows_1250CzechModel);
mProbers[47] = new nsSingleByteCharSetProber(&Iso_8859_2CzechModel);
mProbers[48] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCzechModel);
mProbers[48] = new nsSingleByteCharSetProber(&MaccentraleuropeCzechModel);
mProbers[49] = new nsSingleByteCharSetProber(&Ibm852CzechModel);

mProbers[50] = new nsSingleByteCharSetProber(&Windows_1250SlovakModel);
mProbers[51] = new nsSingleByteCharSetProber(&Iso_8859_2SlovakModel);
mProbers[52] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSlovakModel);
mProbers[52] = new nsSingleByteCharSetProber(&MaccentraleuropeSlovakModel);
mProbers[53] = new nsSingleByteCharSetProber(&Ibm852SlovakModel);

mProbers[54] = new nsSingleByteCharSetProber(&Windows_1250PolishModel);
mProbers[55] = new nsSingleByteCharSetProber(&Iso_8859_2PolishModel);
mProbers[56] = new nsSingleByteCharSetProber(&Iso_8859_13PolishModel);
mProbers[57] = new nsSingleByteCharSetProber(&Iso_8859_16PolishModel);
mProbers[58] = new nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel);
mProbers[58] = new nsSingleByteCharSetProber(&MaccentraleuropePolishModel);
mProbers[59] = new nsSingleByteCharSetProber(&Ibm852PolishModel);

mProbers[60] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel);
Expand All @@ -160,7 +160,7 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[71] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel);
mProbers[72] = new nsSingleByteCharSetProber(&Iso_8859_13CroatianModel);
mProbers[73] = new nsSingleByteCharSetProber(&Iso_8859_16CroatianModel);
mProbers[74] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCroatianModel);
mProbers[74] = new nsSingleByteCharSetProber(&MaccentraleuropeCroatianModel);
mProbers[75] = new nsSingleByteCharSetProber(&Ibm852CroatianModel);

mProbers[76] = new nsSingleByteCharSetProber(&Windows_1252EstonianModel);
Expand All @@ -182,7 +182,7 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[89] = new nsSingleByteCharSetProber(&Windows_1250SloveneModel);
mProbers[90] = new nsSingleByteCharSetProber(&Iso_8859_2SloveneModel);
mProbers[91] = new nsSingleByteCharSetProber(&Iso_8859_16SloveneModel);
mProbers[92] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSloveneModel);
mProbers[92] = new nsSingleByteCharSetProber(&MaccentraleuropeSloveneModel);
mProbers[93] = new nsSingleByteCharSetProber(&Ibm852SloveneModel);

mProbers[94] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel);
Expand Down
10 changes: 5 additions & 5 deletions src/nsSBCharSetProber.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,19 +190,19 @@ extern const SequenceModel Iso_8859_3MalteseModel;
extern const SequenceModel Windows_1250CzechModel;
extern const SequenceModel Iso_8859_2CzechModel;
extern const SequenceModel Ibm852CzechModel;
extern const SequenceModel Mac_CentraleuropeCzechModel;
extern const SequenceModel MaccentraleuropeCzechModel;

extern const SequenceModel Windows_1250SlovakModel;
extern const SequenceModel Iso_8859_2SlovakModel;
extern const SequenceModel Ibm852SlovakModel;
extern const SequenceModel Mac_CentraleuropeSlovakModel;
extern const SequenceModel MaccentraleuropeSlovakModel;

extern const SequenceModel Windows_1250PolishModel;
extern const SequenceModel Iso_8859_2PolishModel;
extern const SequenceModel Iso_8859_13PolishModel;
extern const SequenceModel Iso_8859_16PolishModel;
extern const SequenceModel Ibm852PolishModel;
extern const SequenceModel Mac_CentraleuropePolishModel;
extern const SequenceModel MaccentraleuropePolishModel;

extern const SequenceModel Iso_8859_1FinnishModel;
extern const SequenceModel Iso_8859_4FinnishModel;
Expand All @@ -222,7 +222,7 @@ extern const SequenceModel Iso_8859_2CroatianModel;
extern const SequenceModel Iso_8859_13CroatianModel;
extern const SequenceModel Iso_8859_16CroatianModel;
extern const SequenceModel Ibm852CroatianModel;
extern const SequenceModel Mac_CentraleuropeCroatianModel;
extern const SequenceModel MaccentraleuropeCroatianModel;

extern const SequenceModel Windows_1252EstonianModel;
extern const SequenceModel Windows_1257EstonianModel;
Expand All @@ -244,7 +244,7 @@ extern const SequenceModel Windows_1250SloveneModel;
extern const SequenceModel Iso_8859_2SloveneModel;
extern const SequenceModel Iso_8859_16SloveneModel;
extern const SequenceModel Ibm852SloveneModel;
extern const SequenceModel Mac_CentraleuropeSloveneModel;
extern const SequenceModel MaccentraleuropeSloveneModel;

extern const SequenceModel Iso_8859_1SwedishModel;
extern const SequenceModel Iso_8859_4SwedishModel;
Expand Down
17 changes: 15 additions & 2 deletions src/nsUniversalDetector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)

mStart = PR_TRUE;
mDetectedCharset = nsnull;
mDetectedConfidence = 0.0;
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
Expand Down Expand Up @@ -83,6 +84,7 @@ nsUniversalDetector::Reset()

mStart = PR_TRUE;
mDetectedCharset = nsnull;
mDetectedConfidence = 0.0;
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
Expand Down Expand Up @@ -120,11 +122,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
/* EF BB BF: UTF-8 encoded BOM. */
mDetectedCharset = "UTF-8";
mDetectedConfidence = 0.99;
break;
case '\xFE':
if ('\xFF' == aBuf[1])
/* FE FF: UTF-16, big endian BOM. */
mDetectedCharset = "UTF-16";
mDetectedConfidence = 0.99;
break;
case '\xFF':
if ('\xFE' == aBuf[1])
Expand All @@ -135,11 +139,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
/* FF FE 00 00: UTF-32 (LE). */
mDetectedCharset = "UTF-32";
mDetectedConfidence = 0.99;
}
else
{
/* FF FE: UTF-16, little endian BOM. */
mDetectedCharset = "UTF-16";
mDetectedConfidence = 0.99;
}
}
break;
Expand All @@ -151,6 +157,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
/* 00 00 FE FF: UTF-32 (BE). */
mDetectedCharset = "UTF-32";
mDetectedConfidence = 0.99;
}
break;
}
Expand Down Expand Up @@ -241,16 +248,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
mDone = PR_TRUE;
mDetectedCharset = mEscCharSetProber->GetCharSetName();
mDetectedConfidence = mEscCharSetProber->GetConfidence();
}
else if (mNbspFound)
{
mDetectedCharset = "ISO-8859-1";
mDetectedConfidence = 1.0;
}
else
{
/* ASCII with the ESC character (or the sequence "~{") is still
* ASCII until proven otherwise. */
mDetectedCharset = "ASCII";
mDetectedConfidence = 1.0;
}
break;
case eHighbyte:
Expand All @@ -263,6 +273,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
mDone = PR_TRUE;
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
mDetectedConfidence = mCharSetProbers[i]->GetConfidence();
return NS_OK;
}
}
Expand All @@ -275,11 +286,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
* (though it could have been any ISO-8859 encoding). */
mDetectedCharset = "ISO-8859-1";
mDetectedConfidence = 1.0;
}
else
{
/* Pure ASCII */
mDetectedCharset = "ASCII";
mDetectedConfidence = 1.0;
}
break;
}
Expand All @@ -300,7 +313,7 @@ void nsUniversalDetector::DataEnd()
if (mDetectedCharset)
{
mDone = PR_TRUE;
Report(mDetectedCharset);
Report(mDetectedCharset, mDetectedConfidence);
return;
}

Expand All @@ -326,7 +339,7 @@ void nsUniversalDetector::DataEnd()
}
//do not report anything because we are not confident of it, that's in fact a negative answer
if (maxProberConfidence > MINIMUM_THRESHOLD)
Report(mCharSetProbers[maxProber]->GetCharSetName());
Report(mCharSetProbers[maxProber]->GetCharSetName(), mCharSetProbers[maxProber]->GetConfidence());
}
break;
case eEscAscii:
Expand Down
3 changes: 2 additions & 1 deletion src/nsUniversalDetector.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class nsUniversalDetector {
virtual void DataEnd(void);

protected:
virtual void Report(const char* aCharset) = 0;
virtual void Report(const char* aCharset, float aConfidence) = 0;
virtual void Reset();
nsInputState mInputState;
PRBool mNbspFound;
Expand All @@ -79,6 +79,7 @@ class nsUniversalDetector {
PRBool mGotData;
char mLastChar;
const char * mDetectedCharset;
float mDetectedConfidence;
PRInt32 mBestGuess;
PRUint32 mLanguageFilter;

Expand Down
1 change: 1 addition & 0 deletions src/symbols.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set(
uchardet_data_end
uchardet_reset
uchardet_get_charset
uchardet_get_confidence
)

set (LINK_FLAGS "")
Expand Down
20 changes: 17 additions & 3 deletions src/uchardet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,29 @@ class HandleUniversalDetector : public nsUniversalDetector
{
protected:
char *m_charset;

float m_confidence;
public:
/* Construct the detector, passing NS_FILTER_ALL to the base class.
 * Nothing has been reported yet: the charset pointer starts out null
 * and the confidence starts out at zero. */
HandleUniversalDetector()
    : nsUniversalDetector(NS_FILTER_ALL)
    , m_charset(0)
    , m_confidence(0.0)
{
}

virtual ~HandleUniversalDetector()
{
if (m_charset)
if (m_charset) {
free(m_charset);
m_confidence = 0.0;
}
}

virtual void Report(const char* charset)
virtual void Report(const char* charset, float confidence)
{
if (m_charset)
free(m_charset);
m_charset = strdup(charset);
m_confidence = confidence;
}

virtual void Reset()
Expand All @@ -71,12 +75,17 @@ class HandleUniversalDetector : public nsUniversalDetector
if (m_charset)
free(m_charset);
m_charset = strdup("");
m_confidence = 0.0;
}

/* Name of the reported charset, or an empty string when m_charset is
 * null. Never returns a null pointer. */
const char* GetCharset() const
{
    if (m_charset)
        return m_charset;
    return "";
}

float GetConfidence() {
return m_confidence;
}
};

uchardet_t uchardet_new(void)
Expand Down Expand Up @@ -109,3 +118,8 @@ const char* uchardet_get_charset(uchardet_t ud)
{
return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset();
}

/* C API entry point: treats the handle `ud` as a HandleUniversalDetector
 * and returns the confidence value it currently stores. */
float uchardet_get_confidence(uchardet_t ud)
{
    HandleUniversalDetector* detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);
    return detector->GetConfidence();
}
2 changes: 2 additions & 0 deletions src/uchardet.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ void uchardet_reset(uchardet_t ud);
*/
const char * uchardet_get_charset(uchardet_t ud);

/**
 * Get the confidence value associated with the detected charset.
 * @param ud [in] detector handle (a uchardet_t, as returned by uchardet_new())
 * @return the confidence last reported by the detector, or 0.0 if nothing
 *         has been reported yet.
 */
float uchardet_get_confidence(uchardet_t ud);

#ifdef __cplusplus
}
#endif
Expand Down

0 comments on commit f1e11d6

Please sign in to comment.