forked from mirrors/gecko-dev
Set a minimum threshold even for the detector's preferred language. Bug 547487, r=emk
This commit is contained in:
parent
e38251afc0
commit
c38fa14fa2
14 changed files with 88 additions and 30 deletions
|
|
@ -46,16 +46,13 @@
|
|||
#define SURE_YES 0.99f
|
||||
#define SURE_NO 0.01f
|
||||
|
||||
#define MINIMUM_DATA_THRESHOLD 4
|
||||
|
||||
//return confidence based on received data
|
||||
float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
||||
float CharDistributionAnalysis::GetConfidence(void)
|
||||
{
|
||||
//if we didn't receive any character in our consideration range, or the
|
||||
// number of frequent characters is below the minimum threshold, return
|
||||
// negative answer
|
||||
if (mTotalChars <= 0 ||
|
||||
!aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
|
||||
if (mTotalChars <= 0 || mFreqChars <= mDataThreshold)
|
||||
return SURE_NO;
|
||||
|
||||
if (mTotalChars != mFreqChars) {
|
||||
|
|
|
|||
|
|
@ -42,10 +42,12 @@
|
|||
|
||||
#define ENOUGH_DATA_THRESHOLD 1024
|
||||
|
||||
#define MINIMUM_DATA_THRESHOLD 4
|
||||
|
||||
class CharDistributionAnalysis
|
||||
{
|
||||
public:
|
||||
CharDistributionAnalysis() {Reset();}
|
||||
CharDistributionAnalysis() {Reset(PR_FALSE);}
|
||||
|
||||
//feed a block of data and do distribution analysis
|
||||
void HandleData(const char* aBuf, PRUint32 aLen) {}
|
||||
|
|
@ -71,14 +73,15 @@ public:
|
|||
}
|
||||
|
||||
//return confidence based on existing data
|
||||
float GetConfidence(PRBool aIsPreferredLanguage);
|
||||
float GetConfidence(void);
|
||||
|
||||
//Reset analyser, clear any state
|
||||
void Reset(void)
|
||||
void Reset(PRBool aIsPreferredLanguage)
|
||||
{
|
||||
mDone = PR_FALSE;
|
||||
mTotalChars = 0;
|
||||
mFreqChars = 0;
|
||||
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
|
||||
}
|
||||
|
||||
//This function is for future extension. Caller can use this function to control
|
||||
|
|
@ -104,6 +107,9 @@ protected:
|
|||
//Total characters encountered.
|
||||
PRUint32 mTotalChars;
|
||||
|
||||
//Number of hi-byte characters needed to trigger detection
|
||||
PRUint32 mDataThreshold;
|
||||
|
||||
//Mapping table to get frequency order from char order (get from GetOrder())
|
||||
const PRInt16 *mCharToFreqOrder;
|
||||
|
||||
|
|
|
|||
|
|
@ -170,7 +170,7 @@ void JapaneseContextAnalysis::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
return;
|
||||
}
|
||||
|
||||
void JapaneseContextAnalysis::Reset(void)
|
||||
void JapaneseContextAnalysis::Reset(PRBool aIsPreferredLanguage)
|
||||
{
|
||||
mTotalRel = 0;
|
||||
for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++)
|
||||
|
|
@ -178,13 +178,14 @@ void JapaneseContextAnalysis::Reset(void)
|
|||
mNeedToSkipCharNum = 0;
|
||||
mLastCharOrder = -1;
|
||||
mDone = PR_FALSE;
|
||||
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
|
||||
}
|
||||
#define DONT_KNOW (float)-1
|
||||
|
||||
float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
||||
float JapaneseContextAnalysis::GetConfidence(void)
|
||||
{
|
||||
//This is just one way to calculate confidence. It works well for me.
|
||||
if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD)
|
||||
if (mTotalRel > mDataThreshold)
|
||||
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
|
||||
else
|
||||
return (float)DONT_KNOW;
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ extern const PRUint8 jp2CharContext[83][83];
|
|||
class JapaneseContextAnalysis
|
||||
{
|
||||
public:
|
||||
JapaneseContextAnalysis() {Reset();}
|
||||
JapaneseContextAnalysis() {Reset(PR_FALSE);}
|
||||
|
||||
void HandleData(const char* aBuf, PRUint32 aLen);
|
||||
|
||||
|
|
@ -74,8 +74,8 @@ public:
|
|||
mLastCharOrder = order;
|
||||
}
|
||||
|
||||
float GetConfidence(PRBool aIsPreferredLanguage);
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void Reset(PRBool aIsPreferredLanguage);
|
||||
void SetOpion(){}
|
||||
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
|
||||
|
||||
|
|
@ -89,6 +89,9 @@ protected:
|
|||
//total sequence received
|
||||
PRUint32 mTotalRel;
|
||||
|
||||
//Number of sequences needed to trigger detection
|
||||
PRUint32 mDataThreshold;
|
||||
|
||||
//The order of previous char
|
||||
PRInt32 mLastCharOrder;
|
||||
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ void nsBig5Prober::Reset(void)
|
|||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mDistributionAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
}
|
||||
|
||||
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
|
|
@ -81,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsBig5Prober::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,8 +46,8 @@ void nsEUCJPProber::Reset(void)
|
|||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mContextAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset();
|
||||
mContextAnalyser.Reset(mIsPreferredLanguage);
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
}
|
||||
|
||||
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
|
|
@ -91,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsEUCJPProber::GetConfidence(void)
|
||||
{
|
||||
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float contxtCf = mContextAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ void nsEUCKRProber::Reset(void)
|
|||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mDistributionAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
//mContextAnalyser.Reset();
|
||||
}
|
||||
|
||||
|
|
@ -84,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsEUCKRProber::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ void nsEUCTWProber::Reset(void)
|
|||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mDistributionAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
//mContextAnalyser.Reset();
|
||||
}
|
||||
|
||||
|
|
@ -84,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsEUCTWProber::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ void nsGB18030Prober::Reset(void)
|
|||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mDistributionAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
//mContextAnalyser.Reset();
|
||||
}
|
||||
|
||||
|
|
@ -89,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsGB18030Prober::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,8 +46,8 @@ void nsSJISProber::Reset(void)
|
|||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mContextAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset();
|
||||
mContextAnalyser.Reset(mIsPreferredLanguage);
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
}
|
||||
|
||||
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
|
|
@ -90,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsSJISProber::GetConfidence(void)
|
||||
{
|
||||
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float contxtCf = mContextAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
|
||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,6 +32,16 @@ function InitDetectorTests()
|
|||
SetDetectorPref(gDetectorList[0]);
|
||||
gTestIndex = 0;
|
||||
$("testframe").onload = DoDetectionTest;
|
||||
|
||||
if (gExpectedCharset == "default") {
|
||||
try {
|
||||
gExpectedCharset = prefService
|
||||
.getComplexValue("intl.charset.default",
|
||||
Components.interfaces.nsIPrefLocalizedString).data;
|
||||
} catch (e) {
|
||||
gExpectedCharset = "ISO-8859-8";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function SetDetectorPref(aPrefValue)
|
||||
|
|
|
|||
|
|
@ -66,6 +66,8 @@ _TEST_FILES = \
|
|||
test_bug431054-japanese.html \
|
||||
bug488426_text.html \
|
||||
test_bug488426.html \
|
||||
bug547487_text.html \
|
||||
test_bug547487.html \
|
||||
$(NULL)
|
||||
|
||||
libs:: $(_TEST_FILES)
|
||||
|
|
|
|||
1
extensions/universalchardet/tests/bug547487_text.html
Normal file
1
extensions/universalchardet/tests/bug547487_text.html
Normal file
|
|
@ -0,0 +1 @@
|
|||
The quick brown fox jumps over the lazy dog.
|
||||
38
extensions/universalchardet/tests/test_bug547487.html
Normal file
38
extensions/universalchardet/tests/test_bug547487.html
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
<!DOCTYPE HTML>
|
||||
<html>
|
||||
<!--
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=547487
|
||||
-->
|
||||
<head>
|
||||
<title>Test for Bug 547487</title>
|
||||
<script type="text/javascript"
|
||||
src="chrome://mochikit/content/MochiKit/packed.js"></script>
|
||||
<script type="text/javascript"
|
||||
src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
|
||||
</script>
|
||||
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
|
||||
<link rel="stylesheet" type="text/css"
|
||||
href="chrome://mochikit/content/tests/SimpleTest/test.css" />
|
||||
</head>
|
||||
<body>
|
||||
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=547487">Mozilla Bug 547487</a>
|
||||
<p id="display"></p>
|
||||
<div id="content" style="display: none">
|
||||
</div>
|
||||
<iframe id="testframe"></iframe>
|
||||
<pre id="test">
|
||||
<script class="testbody" type="text/javascript">
|
||||
/** Test for Bug 547487 **/
|
||||
CharsetDetectionTests("bug547487_text.html",
|
||||
"default",
|
||||
new Array("zhtw_parallel_state_machine",
|
||||
"zhcn_parallel_state_machine",
|
||||
"ja_parallel_state_machine",
|
||||
"ko_parallel_state_machine",
|
||||
"zh_parallel_state_machine",
|
||||
"cjk_parallel_state_machine",
|
||||
"universal_charset_detector"));
|
||||
</script>
|
||||
</pre>
|
||||
</body>
|
||||
</html>
|
||||
Loading…
Reference in a new issue