forked from mirrors/gecko-dev
		
	
		
			
				
	
	
		
			1334 lines
		
	
	
	
		
			50 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			1334 lines
		
	
	
	
		
			50 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| // Copyright 2013 Google Inc. All Rights Reserved.
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| //     http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| //
 | |
| // Author: dsites@google.com (Dick Sites)
 | |
| // Updated 2014.01 for dual table lookup
 | |
| //
 | |
| 
 | |
| #include "scoreonescriptspan.h"
 | |
| 
 | |
| #include "cldutil.h"
 | |
| #include "debug.h"
 | |
| #include "lang_script.h"
 | |
| 
 | |
| #include <stdio.h>
 | |
| 
 | |
| using namespace std;
 | |
| 
 | |
| namespace CLD2 {
 | |
| 
 | |
| static const int kUnreliablePercentThreshold = 75;
 | |
| 
 | |
| void AddLangProb(uint32 langprob, Tote* chunk_tote) {
 | |
|   ProcessProbV2Tote(langprob, chunk_tote);
 | |
| }
 | |
| 
 | |
| void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
 | |
|   uint8 top1 = (langprob >> 8) & 0xff;
 | |
|   chunk_tote->SetScore(top1, 0);
 | |
| }
 | |
| 
 | |
| bool SameCloseSet(uint16 lang1, uint16 lang2) {
 | |
|   int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
 | |
|   if (lang1_close_set == 0) {return false;}
 | |
|   int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
 | |
|   return (lang1_close_set == lang2_close_set);
 | |
| }
 | |
| 
 | |
| bool SameCloseSet(Language lang1, Language lang2) {
 | |
|   int lang1_close_set = LanguageCloseSet(lang1);
 | |
|   if (lang1_close_set == 0) {return false;}
 | |
|   int lang2_close_set = LanguageCloseSet(lang2);
 | |
|   return (lang1_close_set == lang2_close_set);
 | |
| }
 | |
| 
 | |
| 
 | |
| // Needs expected score per 1KB in scoring context
 | |
| void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
 | |
|                      int offset, int len,
 | |
|                      const ScoringContext* scoringcontext,
 | |
|                      const Tote* chunk_tote,
 | |
|                      ChunkSummary* chunksummary) {
 | |
|   int key3[3];
 | |
|   chunk_tote->CurrentTopThreeKeys(key3);
 | |
|   Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
 | |
|   Language lang2 = FromPerScriptNumber(ulscript, key3[1]);
 | |
| 
 | |
|   int actual_score_per_kb = 0;
 | |
|   if (len > 0) {
 | |
|     actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
 | |
|   }
 | |
|   int expected_subscr = lang1 * 4 + LScript4(ulscript);
 | |
|   int expected_score_per_kb =
 | |
|      scoringcontext->scoringtables->kExpectedScore[expected_subscr];
 | |
| 
 | |
|   chunksummary->offset = offset;
 | |
|   chunksummary->chunk_start = first_linear_in_chunk;
 | |
|   chunksummary->lang1 = lang1;
 | |
|   chunksummary->lang2 = lang2;
 | |
|   chunksummary->score1 = chunk_tote->GetScore(key3[0]);
 | |
|   chunksummary->score2 = chunk_tote->GetScore(key3[1]);
 | |
|   chunksummary->bytes = len;
 | |
|   chunksummary->grams = chunk_tote->GetScoreCount();
 | |
|   chunksummary->ulscript = ulscript;
 | |
|   chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
 | |
|                                                      chunksummary->score2,
 | |
|                                                      chunksummary->grams);
 | |
|   // If lang1/lang2 in same close set, set delta reliability to 100%
 | |
|   if (SameCloseSet(lang1, lang2)) {
 | |
|     chunksummary->reliability_delta = 100;
 | |
|   }
 | |
|   chunksummary->reliability_score =
 | |
|      ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
 | |
| }
 | |
| 
 | |
| // Return true if just lang1 is there: lang2=0 and lang3=0
 | |
| bool IsSingleLang(uint32 langprob) {
 | |
|   // Probably a bug -- which end is lang1? But only used to call empty Boost1
 | |
|   return ((langprob & 0x00ffff00) == 0);
 | |
| }
 | |
| 
 | |
| // Update scoring context distinct_boost for single language quad
 | |
| void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
 | |
|   // Probably keep this empty -- not a good enough signal
 | |
| }
 | |
| 
 | |
| // Update scoring context distinct_boost for distinct octagram
 | |
| // Keep last 4 used. Since these are mostly (except at splices) in
 | |
| // hitbuffer, we might be able to just use a subscript and splice
 | |
| void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
 | |
| // this is called 0..n times per chunk with decoded hitbuffer->distinct...
 | |
|   LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
 | |
|   if (scoringcontext->ulscript != ULScript_Latin) {
 | |
|     distinct_boost = &scoringcontext->distinct_boost.othr;
 | |
|   }
 | |
|   int n = distinct_boost->n;
 | |
|   distinct_boost->langprob[n] = langprob;
 | |
|   distinct_boost->n = distinct_boost->wrap(n + 1);
 | |
| }
 | |
| 
 | |
| // For each chunk, add extra weight for language priors (from content-lang and
 | |
| // meta lang=xx) and distinctive tokens
 | |
| void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
 | |
|   // Get boosts for current script
 | |
|   const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
 | |
|   const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
 | |
|   const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
 | |
|   if (scoringcontext->ulscript != ULScript_Latin) {
 | |
|     langprior_boost = &scoringcontext->langprior_boost.othr;
 | |
|     langprior_whack = &scoringcontext->langprior_whack.othr;
 | |
|     distinct_boost = &scoringcontext->distinct_boost.othr;
 | |
|   }
 | |
| 
 | |
|   for (int k = 0; k < kMaxBoosts; ++k) {
 | |
|     uint32 langprob = langprior_boost->langprob[k];
 | |
|     if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
 | |
|   }
 | |
|   for (int k = 0; k < kMaxBoosts; ++k) {
 | |
|     uint32 langprob = distinct_boost->langprob[k];
 | |
|     if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
 | |
|   }
 | |
|   // boost has a packed set of per-script langs and probabilites
 | |
|   // whack has a packed set of per-script lang to be suppressed (zeroed)
 | |
|   // When a language in a close set is given as an explicit hint, others in
 | |
|   //  that set will be whacked here.
 | |
|   for (int k = 0; k < kMaxBoosts; ++k) {
 | |
|     uint32 langprob = langprior_whack->langprob[k];
 | |
|     if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| // At this point, The chunk is described by
 | |
| //  hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
 | |
| //  hitbuffer->delta[cspan->chunk_delta ... )
 | |
| //  hitbuffer->distinct[cspan->chunk_distinct ... )
 | |
| // Scored text is in text[lo..hi) where
 | |
| //  lo is 0 or the min of first base/delta/distinct hitbuffer offset and
 | |
| //  hi is the min of next base/delta/distinct hitbuffer offset after
 | |
| //  base_len, etc.
 | |
| void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
 | |
|                         const ChunkSpan* cspan, int* lo, int* hi) {
 | |
|   // Front of this span
 | |
|   int lo_base = hitbuffer->base[cspan->chunk_base].offset;
 | |
|   int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
 | |
|   int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
 | |
|   // Front of next span
 | |
|   int hi_base = hitbuffer->base[cspan->chunk_base +
 | |
|     cspan->base_len].offset;
 | |
|   int hi_delta = hitbuffer->delta[cspan->chunk_delta +
 | |
|     cspan->delta_len].offset;
 | |
|   int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
 | |
|     cspan->distinct_len].offset;
 | |
| 
 | |
|   *lo = 0;
 | |
| //  if (cspan->chunk_base > 0) {
 | |
| //    *lo = minint(minint(lo_base, lo_delta), lo_distinct);
 | |
| //  }
 | |
|   *lo = minint(minint(lo_base, lo_delta), lo_distinct);
 | |
|   *hi = minint(minint(hi_base, hi_delta), hi_distinct);
 | |
| }
 | |
| 
 | |
| 
 | |
| int DiffScore(const CLD2TableSummary* obj, int indirect,
 | |
|               uint16 lang1, uint16 lang2) {
 | |
|   if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
 | |
|     // Up to three languages at indirect
 | |
|     uint32 langprob = obj->kCLDTableInd[indirect];
 | |
|     return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
 | |
|   } else {
 | |
|     // Up to six languages at start + 2 * (indirect - start)
 | |
|     indirect += (indirect - obj->kCLDTableSizeOne);
 | |
|     uint32 langprob = obj->kCLDTableInd[indirect];
 | |
|     uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
 | |
|     return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
 | |
|       (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
 | |
|   }
 | |
| 
 | |
| }
 | |
| 
 | |
| // Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote
 | |
| // After last chunk there is always a hitbuffer entry with an offset just off
 | |
| // the end of the text.
 | |
| // Sets delta_len, and distinct_len
 | |
| void ScoreOneChunk(const char* text, ULScript ulscript,
 | |
|                    const ScoringHitBuffer* hitbuffer,
 | |
|                    int chunk_i,
 | |
|                    ScoringContext* scoringcontext,
 | |
|                    ChunkSpan* cspan, Tote* chunk_tote,
 | |
|                    ChunkSummary* chunksummary) {
 | |
|   int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
 | |
|   int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];
 | |
| 
 | |
|   chunk_tote->Reinit();
 | |
|   cspan->delta_len = 0;
 | |
|   cspan->distinct_len = 0;
 | |
|   if (scoringcontext->flags_cld2_verbose) {
 | |
|     fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
 | |
|             first_linear_in_chunk, first_linear_in_next_chunk);
 | |
|   }
 | |
| 
 | |
|   // 2013.02.05 linear design: just use base and base_len for the span
 | |
|   cspan->chunk_base = first_linear_in_chunk;
 | |
|   cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
 | |
|   for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
 | |
|     uint32 langprob = hitbuffer->linear[i].langprob;
 | |
|     AddLangProb(langprob, chunk_tote);
 | |
|     if (hitbuffer->linear[i].type <= QUADHIT) {
 | |
|       chunk_tote->AddScoreCount();      // Just count quads, not octas
 | |
|     }
 | |
|     if (hitbuffer->linear[i].type == DISTINCTHIT) {
 | |
|       AddDistinctBoost2(langprob, scoringcontext);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // Score language prior boosts
 | |
|   // Score distinct word boost
 | |
|   ScoreBoosts(scoringcontext, chunk_tote);
 | |
| 
 | |
|   int lo = hitbuffer->linear[first_linear_in_chunk].offset;
 | |
|   int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;
 | |
| 
 | |
|   // Chunk_tote: get top langs, scores, etc. and fill in chunk summary
 | |
|   SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
 | |
|                   scoringcontext, chunk_tote, chunksummary);
 | |
| 
 | |
|   bool more_to_come = false;
 | |
|   bool score_cjk = false;
 | |
|   if (scoringcontext->flags_cld2_html) {
 | |
|     // Show one chunk in readable output
 | |
|     CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
 | |
|                scoringcontext, cspan, chunksummary);
 | |
|   }
 | |
| 
 | |
|   scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
 | |
| }
 | |
| 
 | |
| 
 | |
| // Score chunks of text described by hitbuffer, allowing each to be in a
 | |
| // different language, and optionally adjusting the boundaries inbetween.
 | |
| // Set last_cspan to the last chunkspan used
 | |
| void ScoreAllHits(const char* text,  ULScript ulscript,
 | |
|                   bool more_to_come, bool score_cjk,
 | |
|                   const ScoringHitBuffer* hitbuffer,
 | |
|                   ScoringContext* scoringcontext,
 | |
|                   SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
 | |
|   ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
 | |
|   ChunkSpan cspan = {0, 0, 0, 0, 0, 0};
 | |
| 
 | |
|   for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
 | |
|     // Score one chunk
 | |
|     // Sets delta_len, and distinct_len
 | |
|     Tote chunk_tote;
 | |
|     ChunkSummary chunksummary;
 | |
|     ScoreOneChunk(text, ulscript,
 | |
|                   hitbuffer, i,
 | |
|                   scoringcontext, &cspan, &chunk_tote, &chunksummary);
 | |
| 
 | |
|     // Put result in summarybuffer
 | |
|     if (summarybuffer->n < kMaxSummaries) {
 | |
|       summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
 | |
|       summarybuffer->n += 1;
 | |
|     }
 | |
| 
 | |
|     prior_cspan = cspan;
 | |
|     cspan.chunk_base += cspan.base_len;
 | |
|     cspan.chunk_delta += cspan.delta_len;
 | |
|     cspan.chunk_distinct += cspan.distinct_len;
 | |
|   }
 | |
| 
 | |
|   // Add one dummy off the end to hold first unused linear_in_chunk
 | |
|   int linear_off_end = hitbuffer->next_linear;
 | |
|   int offset_off_end = hitbuffer->linear[linear_off_end].offset;
 | |
|   ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
 | |
|   memset(cs, 0, sizeof(ChunkSummary));
 | |
|   cs->offset = offset_off_end;
 | |
|   cs->chunk_start = linear_off_end;
 | |
|   *last_cspan = prior_cspan;
 | |
| }
 | |
| 
 | |
| 
 | |
| void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
 | |
|                             bool more_to_come, DocTote* doc_tote) {
 | |
|   int cs_bytes_sum = 0;
 | |
|   for (int i = 0; i < summarybuffer->n; ++i) {
 | |
|     const ChunkSummary* cs = &summarybuffer->chunksummary[i];
 | |
|     int reliability = minint(cs->reliability_delta, cs->reliability_score);
 | |
|     // doc_tote uses full languages
 | |
|     doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
 | |
|     cs_bytes_sum += cs->bytes;
 | |
|   }
 | |
| }
 | |
| 
 | |
| // Turn on for debugging vectors
 | |
| static const bool kShowLettersOriginal = false;
 | |
| 
 | |
| 
 | |
| // If next chunk language matches last vector language, extend last element
 | |
| // Otherwise add new element to vector
 | |
| void ItemToVector(ScriptScanner* scanner,
 | |
|                   ResultChunkVector* vec, Language new_lang,
 | |
|                   int mapped_offset, int mapped_len) {
 | |
|   uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
 | |
|   int last_vec_subscr = vec->size() - 1;
 | |
|   if (last_vec_subscr >= 0) {
 | |
|     ResultChunk* priorrc = &(*vec)[last_vec_subscr];
 | |
|     last_vec_lang = priorrc->lang1;
 | |
|     if (new_lang == last_vec_lang) {
 | |
|       // Extend prior. Current mapped_offset may be beyond prior end, so do
 | |
|       // the arithmetic to include any such gap
 | |
|       priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
 | |
|                               kMaxResultChunkBytes);
 | |
|       if (kShowLettersOriginal) {
 | |
|         // Optionally print the new chunk original text
 | |
|         string temp2(&scanner->GetBufferStart()[priorrc->offset],
 | |
|                      priorrc->bytes);
 | |
|         fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
 | |
|                 priorrc->offset, priorrc->offset + priorrc->bytes,
 | |
|                 GetHtmlEscapedText(temp2).c_str());
 | |
|       }
 | |
|       return;
 | |
|     }
 | |
|   }
 | |
|   // Add new vector element
 | |
|   ResultChunk rc;
 | |
|   rc.offset = mapped_offset;
 | |
|   rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
 | |
|   rc.lang1 = static_cast<uint16>(new_lang);
 | |
|   vec->push_back(rc);
 | |
|   if (kShowLettersOriginal) {
 | |
|     // Optionally print the new chunk original text
 | |
|     string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
 | |
|     fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
 | |
|             rc.offset, rc.offset + rc.bytes,
 | |
|             GetHtmlEscapedText(temp2).c_str());
 | |
|   }
 | |
| }
 | |
| 
 | |
| uint16 PriorVecLang(const ResultChunkVector* vec) {
 | |
|   if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
 | |
|   return (*vec)[vec->size() - 1].lang1;
 | |
| }
 | |
| 
 | |
| uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
 | |
|   if ((i + 1) >= summarybuffer->n) {
 | |
|     return static_cast<uint16>(UNKNOWN_LANGUAGE);
 | |
|   }
 | |
|   return summarybuffer->chunksummary[i + 1].lang1;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| // Add n elements of summarybuffer to resultchunk vector:
 | |
| // Each element is letters-only text [offset..offset+bytes)
 | |
| // This maps back to original[Back(offset)..Back(offset+bytes))
 | |
| //
 | |
| // We go out of our way to minimize the variation in the ResultChunkVector,
 | |
| // so that the caller has fewer but more meaningful spans in different
 | |
| // lanaguges, for the likely purpose of translation or spell-check.
 | |
| //
 | |
| // The language of each chunk is lang1, but it might be unreliable for
 | |
| // either of two reasons: its score is relatively too close to the score of
 | |
| // lang2, or its score is too far away from the expected score of real text in
 | |
| // the given language. Unreliable languages are mapped to Unknown.
 | |
| //
 | |
| void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
 | |
|                            const SummaryBuffer* summarybuffer,
 | |
|                            bool more_to_come, ResultChunkVector* vec) {
 | |
|   if (vec == NULL) {return;}
 | |
| 
 | |
|   if (kShowLettersOriginal) {
 | |
|     fprintf(stderr, "map2original_ ");
 | |
|     scanner->map2original_.DumpWindow();
 | |
|     fprintf(stderr, "<br>\n");
 | |
|     fprintf(stderr, "map2uplow_ ");
 | |
|     scanner->map2uplow_.DumpWindow();
 | |
|     fprintf(stderr, "<br>\n");
 | |
|   }
 | |
| 
 | |
|   for (int i = 0; i < summarybuffer->n; ++i) {
 | |
|     const ChunkSummary* cs = &summarybuffer->chunksummary[i];
 | |
|     int unmapped_offset = cs->offset;
 | |
|     int unmapped_len = cs->bytes;
 | |
| 
 | |
|     if (kShowLettersOriginal) {
 | |
|       // Optionally print the chunk lowercase letters/marks text
 | |
|       string temp(&text[unmapped_offset], unmapped_len);
 | |
|       fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
 | |
|               unmapped_offset, unmapped_offset + unmapped_len,
 | |
|               GetHtmlEscapedText(temp).c_str());
 | |
|     }
 | |
| 
 | |
|     int mapped_offset = scanner->MapBack(unmapped_offset);
 | |
| 
 | |
|     // Trim back a little to prefer splicing original at word boundaries
 | |
|     if (mapped_offset > 0) {
 | |
|       // Size of prior vector entry, if any
 | |
|       int prior_size = 0;
 | |
|       if (!vec->empty()) {
 | |
|         ResultChunk* rc = &(*vec)[vec->size() - 1];
 | |
|         prior_size = rc->bytes;
 | |
|       }
 | |
|       // Maximum back up size to leave at least 3 bytes in prior,
 | |
|       // and not entire buffer, and no more than 12 bytes total backup
 | |
|       int n_limit = minint(prior_size - 3, mapped_offset);
 | |
|       n_limit = minint(n_limit, 12);
 | |
| 
 | |
|       // Backscan over letters, stopping if prior byte is < 0x41
 | |
|       // There is some possibility that we will backscan over a different script
 | |
|       const char* s = &scanner->GetBufferStart()[mapped_offset];
 | |
|       const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
 | |
|       int n = 0;
 | |
|       while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
 | |
|       if (n >= n_limit) {n = 0;} // New boundary not found within range
 | |
| 
 | |
|       // Also back up exactly one leading punctuation character if '"#@
 | |
|       if (n < n_limit) {
 | |
|         unsigned char c = us[-n - 1];
 | |
|         if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
 | |
|       }
 | |
|       // Shrink the previous chunk slightly
 | |
|       if (n > 0) {
 | |
|         ResultChunk* rc = &(*vec)[vec->size() - 1];
 | |
|         rc->bytes -= n;
 | |
|         mapped_offset -= n;
 | |
|         if (kShowLettersOriginal) {
 | |
|           fprintf(stderr, "Back up %d bytes<br>\n", n);
 | |
|           // Optionally print the prior chunk original text
 | |
|           string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
 | |
|           fprintf(stderr, "Prior   [%d..%d) '%s'<br>\n",
 | |
|                   rc->offset, rc->offset + rc->bytes,
 | |
|                   GetHtmlEscapedText(temp2).c_str());
 | |
|         }
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     int mapped_len =
 | |
|       scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
 | |
| 
 | |
|     if (kShowLettersOriginal) {
 | |
|       // Optionally print the chunk original text
 | |
|       string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
 | |
|       fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
 | |
|               mapped_offset, mapped_offset + mapped_len,
 | |
|               GetHtmlEscapedText(temp2).c_str());
 | |
|     }
 | |
| 
 | |
|     Language new_lang = static_cast<Language>(cs->lang1);
 | |
|     bool reliability_delta_bad =
 | |
|       (cs->reliability_delta < kUnreliablePercentThreshold);
 | |
|     bool reliability_score_bad =
 | |
|       (cs->reliability_score < kUnreliablePercentThreshold);
 | |
| 
 | |
|     // If the top language matches last vector, ignore reliability_delta
 | |
|     uint16 prior_lang = PriorVecLang(vec);
 | |
|     if (prior_lang == cs->lang1) {
 | |
|       reliability_delta_bad = false;
 | |
|     }
 | |
|     // If the top language is in same close set as last vector, set up to merge
 | |
|     if (SameCloseSet(cs->lang1, prior_lang)) {
 | |
|       new_lang = static_cast<Language>(prior_lang);
 | |
|       reliability_delta_bad = false;
 | |
|     }
 | |
|     // If the top two languages are in the same close set and the last vector
 | |
|     // language is the second language, set up to merge
 | |
|     if (SameCloseSet(cs->lang1, cs->lang2) &&
 | |
|         (prior_lang == cs->lang2)) {
 | |
|       new_lang = static_cast<Language>(prior_lang);
 | |
|       reliability_delta_bad = false;
 | |
|     }
 | |
|     // If unreliable and the last and next vector languages are both
 | |
|     // the second language, set up to merge
 | |
|     uint16 next_lang = NextChunkLang(summarybuffer, i);
 | |
|     if (reliability_delta_bad &&
 | |
|         (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
 | |
|       new_lang = static_cast<Language>(prior_lang);
 | |
|       reliability_delta_bad = false;
 | |
|     }
 | |
| 
 | |
|     if (reliability_delta_bad || reliability_score_bad) {
 | |
|       new_lang = UNKNOWN_LANGUAGE;
 | |
|     }
 | |
|     ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
 | |
|   }
 | |
| }
 | |
| 
 | |
| // Add just one element to resultchunk vector:
 | |
| // For RTypeNone or RTypeOne
 | |
| void JustOneItemToVector(ScriptScanner* scanner, const char* text,
 | |
|                          Language lang1, int unmapped_offset, int unmapped_len,
 | |
|                          ResultChunkVector* vec) {
 | |
|   if (vec == NULL) {return;}
 | |
| 
 | |
|   if (kShowLettersOriginal) {
 | |
|     fprintf(stderr, "map2original_ ");
 | |
|     scanner->map2original_.DumpWindow();
 | |
|     fprintf(stderr, "<br>\n");
 | |
|     fprintf(stderr, "map2uplow_ ");
 | |
|     scanner->map2uplow_.DumpWindow();
 | |
|     fprintf(stderr, "<br>\n");
 | |
|   }
 | |
| 
 | |
|   if (kShowLettersOriginal) {
 | |
|    // Optionally print the chunk lowercase letters/marks text
 | |
|    string temp(&text[unmapped_offset], unmapped_len);
 | |
|    fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
 | |
|            unmapped_offset, unmapped_offset + unmapped_len,
 | |
|            GetHtmlEscapedText(temp).c_str());
 | |
|   }
 | |
| 
 | |
|   int mapped_offset = scanner->MapBack(unmapped_offset);
 | |
|   int mapped_len =
 | |
|     scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
 | |
| 
 | |
|   if (kShowLettersOriginal) {
 | |
|     // Optionally print the chunk original text
 | |
|     string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
 | |
|     fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
 | |
|             mapped_offset, mapped_offset + mapped_len,
 | |
|             GetHtmlEscapedText(temp2).c_str());
 | |
|   }
 | |
| 
 | |
|   ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
 | |
| }
 | |
| 
 | |
| 
 | |
| // Debugging. Not thread safe. Defined in getonescriptspan
 | |
| char* DisplayPiece(const char* next_byte_, int byte_length_);
 | |
| 
 | |
// Makes an indirect subscript easier to read in debug output.
// If the high bit (0x80000000) is on, it is taken out and 2000000000 (2B) is
// added, so table2 entries print as 2000000000 + subscript. Values without
// the high bit are returned unchanged.
inline int PrintableIndirect(int x) {
  const unsigned int kHighBit = 0x80000000u;
  if ((x & kHighBit) == 0) {
    return x;
  }
  return (x & ~kHighBit) + 2000000000;
}
 | |
| void DumpHitBuffer(FILE* df, const char* text,
 | |
|                    const ScoringHitBuffer* hitbuffer) {
 | |
|   fprintf(df,
 | |
|           "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
 | |
|           ULScriptCode(hitbuffer->ulscript),
 | |
|           hitbuffer->next_base, hitbuffer->next_delta,
 | |
|           hitbuffer->next_distinct);
 | |
|   for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
 | |
|     if (i < hitbuffer->next_base) {
 | |
|       fprintf(df, "Q[%d]%d,%d,%s ",
 | |
|               i, hitbuffer->base[i].offset,
 | |
|               PrintableIndirect(hitbuffer->base[i].indirect),
 | |
|               DisplayPiece(&text[hitbuffer->base[i].offset], 6));
 | |
|     }
 | |
|     if (i < hitbuffer->next_delta) {
 | |
|       fprintf(df, "DL[%d]%d,%d,%s ",
 | |
|               i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
 | |
|               DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
 | |
|     }
 | |
|     if (i < hitbuffer->next_distinct) {
 | |
|       fprintf(df, "D[%d]%d,%d,%s ",
 | |
|               i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
 | |
|               DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
 | |
|     }
 | |
|     if (i < hitbuffer->next_base) {
 | |
|       fprintf(df, "<br>\n");
 | |
|     }
 | |
|     if (i > 50) {break;}
 | |
|   }
 | |
|   if (hitbuffer->next_base > 50) {
 | |
|     int i = hitbuffer->next_base;
 | |
|     fprintf(df, "Q[%d]%d,%d,%s ",
 | |
|             i, hitbuffer->base[i].offset,
 | |
|             PrintableIndirect(hitbuffer->base[i].indirect),
 | |
|             DisplayPiece(&text[hitbuffer->base[i].offset], 6));
 | |
|   }
 | |
|   if (hitbuffer->next_delta > 50) {
 | |
|     int i = hitbuffer->next_delta;
 | |
|     fprintf(df, "DL[%d]%d,%d,%s ",
 | |
|             i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
 | |
|             DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
 | |
|   }
 | |
|   if (hitbuffer->next_distinct > 50) {
 | |
|     int i = hitbuffer->next_distinct;
 | |
|     fprintf(df, "D[%d]%d,%d,%s ",
 | |
|             i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
 | |
|             DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
 | |
|   }
 | |
|   fprintf(df, "<br>\n");
 | |
| }
 | |
| 
 | |
| 
 | |
| void DumpLinearBuffer(FILE* df, const char* text,
 | |
|                       const ScoringHitBuffer* hitbuffer) {
 | |
|   fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
 | |
|           hitbuffer->next_linear);
 | |
|   // Include the dummy entry off the end
 | |
|   for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
 | |
|     if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
 | |
|     fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
 | |
|             i, hitbuffer->linear[i].offset,
 | |
|             "UQLD"[hitbuffer->linear[i].type],
 | |
|             hitbuffer->linear[i].langprob,
 | |
|             DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
 | |
|   }
 | |
|   fprintf(df, "<br>\n");
 | |
| 
 | |
|   fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
 | |
|   for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
 | |
|     fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
 | |
|   }
 | |
|   fprintf(df, "<br>\n");
 | |
| }
 | |
| 
 | |
| // Move this verbose debugging output to debug.cc eventually
 | |
| void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
 | |
|   // Print chunksummary
 | |
|   fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
 | |
|           cs->offset,
 | |
|           cs->chunk_start,
 | |
|           LanguageCode(static_cast<Language>(cs->lang1)),
 | |
|           cs->score1,
 | |
|           LanguageCode(static_cast<Language>(cs->lang2)),
 | |
|           cs->score2,
 | |
|           cs->bytes,
 | |
|           cs->grams,
 | |
|           ULScriptCode(static_cast<ULScript>(cs->ulscript)),
 | |
|           cs->reliability_delta,
 | |
|           cs->reliability_score);
 | |
| }
 | |
| 
 | |
| void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
 | |
|   fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
 | |
|   fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
 | |
|               "bytesB ngrams# script rel_delta rel_score<br>\n");
 | |
|   for (int i = 0; i <= summarybuffer->n; ++i) {
 | |
|     fprintf(df, "[%d] ", i);
 | |
|     DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
 | |
|   }
 | |
|   fprintf(df, "<br>\n");
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| // Within hitbuffer->linear[]
 | |
| // <-- prior chunk --><-- this chunk -->
 | |
| // |                  |                 |
 | |
| // linear0            linear1           linear2
 | |
| //     lang0              lang1
 | |
| // The goal of sharpening is to move this_linear to better separate langs
 | |
| int BetterBoundary(const char* text,
 | |
|                    ScoringHitBuffer* hitbuffer,
 | |
|                    ScoringContext* scoringcontext,
 | |
|                    uint16 pslang0, uint16 pslang1,
 | |
|                    int linear0, int linear1, int linear2) {
 | |
|   // Degenerate case, no change
 | |
|   if ((linear2 - linear0) <= 8) {return linear1;}
 | |
| 
 | |
|   // Each diff gives pslang0 score - pslang1 score
 | |
|   // Running diff has four entries + + + + followed by four entries - - - -
 | |
|   // so that this value is maximal at the sharpest boundary between pslang0
 | |
|   // (positive diffs) and pslang1 (negative diffs)
 | |
|   int running_diff = 0;
 | |
|   int diff[8];    // Ring buffer of pslang0-pslang1 differences
 | |
|   // Initialize with first 8 diffs
 | |
|   for (int i = linear0; i < linear0 + 8; ++i) {
 | |
|     int j = i & 7;
 | |
|     uint32 langprob = hitbuffer->linear[i].langprob;
 | |
|     diff[j] = GetLangScore(langprob, pslang0) -
 | |
|        GetLangScore(langprob, pslang1);
 | |
|     if (i < linear0 + 4) {
 | |
|       // First four diffs pslang0 - pslang1
 | |
|       running_diff += diff[j];
 | |
|     } else {
 | |
|       // Second four diffs -(pslang0 - pslang1)
 | |
|       running_diff -= diff[j];
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // Now scan for sharpest boundary. j is at left end of 8 entries
 | |
|   // To be a boundary, there must be both >0 and <0 entries in the window
 | |
|   int better_boundary_value = 0;
 | |
|   int better_boundary = linear1;
 | |
|   for (int i = linear0; i < linear2 - 8; ++i) {
 | |
|     int j = i & 7;
 | |
|     if (better_boundary_value < running_diff) {
 | |
|       bool has_plus = false;
 | |
|       bool has_minus = false;
 | |
|       for (int kk = 0; kk < 8; ++kk) {
 | |
|         if (diff[kk] > 0) {has_plus = true;}
 | |
|         if (diff[kk] < 0) {has_minus = true;}
 | |
|       }
 | |
|       if (has_plus && has_minus) {
 | |
|         better_boundary_value = running_diff;
 | |
|         better_boundary = i + 4;
 | |
|       }
 | |
|     }
 | |
|     // Shift right one entry
 | |
|     uint32 langprob = hitbuffer->linear[i + 8].langprob;
 | |
|     int newdiff = GetLangScore(langprob, pslang0) -
 | |
|        GetLangScore(langprob, pslang1);
 | |
|     int middiff = diff[(i + 4) & 7];
 | |
|     int olddiff = diff[j];
 | |
|     diff[j] = newdiff;
 | |
|     running_diff -= olddiff;                  // Remove left
 | |
|     running_diff += 2 * middiff;              // Convert middle from - to +
 | |
|     running_diff -= newdiff;                  // Insert right
 | |
|   }
 | |
| 
 | |
|   if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
 | |
|     Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
 | |
|     Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
 | |
|     fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n",
 | |
|             linear1, better_boundary,
 | |
|             LanguageCode(lang0), LanguageCode(lang1));
 | |
|     int lin0_off = hitbuffer->linear[linear0].offset;
 | |
|     int lin1_off = hitbuffer->linear[linear1].offset;
 | |
|     int lin2_off = hitbuffer->linear[linear2].offset;
 | |
|     int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
 | |
|     int better_off = hitbuffer->linear[better_boundary].offset;
 | |
|     int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
 | |
|     string old0(&text[lin0_off], lin1_off - lin0_off);
 | |
|     string old1(&text[lin1_off], lin2_off - lin1_off);
 | |
|     string new0(&text[lin0_off], better_offm1 - lin0_off);
 | |
|     string new0m1(&text[better_offm1], better_off - better_offm1);
 | |
|     string new1(&text[better_off], better_offp1 - better_off);
 | |
|     string new1p1(&text[better_offp1], lin2_off - better_offp1);
 | |
|     fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
 | |
|             GetHtmlEscapedText(old0).c_str(),
 | |
|             GetHtmlEscapedText(old1).c_str(),
 | |
|             GetHtmlEscapedText(new0).c_str(),
 | |
|             GetHtmlEscapedText(new0m1).c_str(),
 | |
|             GetHtmlEscapedText(new1).c_str(),
 | |
|             GetHtmlEscapedText(new1p1).c_str());
 | |
|     // Slow picture of differences per linear entry
 | |
|     int d;
 | |
|     for (int i = linear0; i < linear2; ++i) {
 | |
|       if (i == better_boundary) {
 | |
|         fprintf(scoringcontext->debug_file, "^^ ");
 | |
|       }
 | |
|       uint32 langprob = hitbuffer->linear[i].langprob;
 | |
|       d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
 | |
|       const char* s = "=";
 | |
|       //if (d > 2) {s = "\xc2\xaf";}    // Macron
 | |
|       if (d > 2) {s = "#";}
 | |
|       else if (d > 0) {s = "+";}
 | |
|       else if (d < -2) {s = "_";}
 | |
|       else if (d < 0) {s = "-";}
 | |
|       fprintf(scoringcontext->debug_file, "%s ", s);
 | |
|     }
 | |
|     fprintf(scoringcontext->debug_file, "   (scale: #+=-_)<br>\n");
 | |
|   }
 | |
|   return better_boundary;
 | |
| }
 | |
| 
 | |
| 
 | |
| // For all but the first summary, if its top language differs from
 | |
| // the previous chunk, refine the boundary
 | |
| // Linearized version
 | |
| void SharpenBoundaries(const char* text,
 | |
|                        bool more_to_come,
 | |
|                        ScoringHitBuffer* hitbuffer,
 | |
|                        ScoringContext* scoringcontext,
 | |
|                        SummaryBuffer* summarybuffer) {
 | |
| 
 | |
|   int prior_linear = summarybuffer->chunksummary[0].chunk_start;
 | |
|   uint16 prior_lang = summarybuffer->chunksummary[0].lang1;
 | |
| 
 | |
|   if (scoringcontext->flags_cld2_verbose) {
 | |
|     fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
 | |
|   }
 | |
|   for (int i = 1; i < summarybuffer->n; ++i) {
 | |
|     ChunkSummary* cs = &summarybuffer->chunksummary[i];
 | |
|     uint16 this_lang = cs->lang1;
 | |
|     if (this_lang == prior_lang) {
 | |
|       prior_linear = cs->chunk_start;
 | |
|       continue;
 | |
|     }
 | |
| 
 | |
|     int this_linear = cs->chunk_start;
 | |
|     int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;
 | |
| 
 | |
|     // If this/prior in same close set, don't move boundary
 | |
|     if (SameCloseSet(prior_lang, this_lang)) {
 | |
|       prior_linear = this_linear;
 | |
|       prior_lang = this_lang;
 | |
|       continue;
 | |
|     }
 | |
| 
 | |
| 
 | |
|     // Within hitbuffer->linear[]
 | |
|     // <-- prior chunk --><-- this chunk -->
 | |
|     // |                  |                 |
 | |
|     // prior_linear       this_linear       next_linear
 | |
|     //     prior_lang         this_lang
 | |
|     // The goal of sharpening is to move this_linear to better separate langs
 | |
| 
 | |
|     uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
 | |
|                                     static_cast<Language>(prior_lang));
 | |
|     uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
 | |
|                                     static_cast<Language>(this_lang));
 | |
|     int better_linear = BetterBoundary(text,
 | |
|                                        hitbuffer,
 | |
|                                        scoringcontext,
 | |
|                                        pslang0, pslang1,
 | |
|                                        prior_linear, this_linear, next_linear);
 | |
| 
 | |
|     int old_offset = hitbuffer->linear[this_linear].offset;
 | |
|     int new_offset = hitbuffer->linear[better_linear].offset;
 | |
|     cs->chunk_start = better_linear;
 | |
|     cs->offset = new_offset;
 | |
|     // If this_linear moved right, make bytes smaller for this, larger for prior
 | |
|     // If this_linear moved left, make bytes larger for this, smaller for prior
 | |
|     cs->bytes -= (new_offset - old_offset);
 | |
|     summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);
 | |
| 
 | |
|     this_linear = better_linear;    // Update so that next chunk doesn't intrude
 | |
| 
 | |
|     // Consider rescoring the two chunks
 | |
| 
 | |
|     // Update for next round (note: using pre-updated boundary)
 | |
|     prior_linear = this_linear;
 | |
|     prior_lang = this_lang;
 | |
|   }
 | |
| }
 | |
| 
 | |
| // Make a langprob that gives small weight to the default language for ulscript
 | |
| uint32 DefaultLangProb(ULScript ulscript) {
 | |
|   Language default_lang = DefaultLanguage(ulscript);
 | |
|   return MakeLangProb(default_lang, 1);
 | |
| }
 | |
| 
 | |
| // Effectively, do a merge-sort based on text offsets
 | |
| // Look up each indirect value in appropriate scoring table and keep
 | |
| // just the resulting langprobs
 | |
| void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
 | |
|                   ScoringHitBuffer* hitbuffer) {
 | |
|   const CLD2TableSummary* base_obj;       // unigram or quadgram
 | |
|   const CLD2TableSummary* base_obj2;      // quadgram dual table
 | |
|   const CLD2TableSummary* delta_obj;      // bigram or octagram
 | |
|   const CLD2TableSummary* distinct_obj;   // bigram or octagram
 | |
|   uint16 base_hit;
 | |
|   if (score_cjk) {
 | |
|     base_obj = scoringcontext->scoringtables->unigram_compat_obj;
 | |
|     base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
 | |
|     delta_obj = scoringcontext->scoringtables->deltabi_obj;
 | |
|     distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
 | |
|     base_hit = UNIHIT;
 | |
|   } else {
 | |
|     base_obj = scoringcontext->scoringtables->quadgram_obj;
 | |
|     base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
 | |
|     delta_obj = scoringcontext->scoringtables->deltaocta_obj;
 | |
|     distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
 | |
|     base_hit = QUADHIT;
 | |
|   }
 | |
| 
 | |
|   int base_limit = hitbuffer->next_base;
 | |
|   int delta_limit = hitbuffer->next_delta;
 | |
|   int distinct_limit = hitbuffer->next_distinct;
 | |
|   int base_i = 0;
 | |
|   int delta_i = 0;
 | |
|   int distinct_i = 0;
 | |
|   int linear_i = 0;
 | |
| 
 | |
|   // Start with an initial base hit for the default language for this script
 | |
|   // Inserting this avoids edge effects with no hits at all
 | |
|   hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
 | |
|   hitbuffer->linear[linear_i].type = base_hit;
 | |
|   hitbuffer->linear[linear_i].langprob =
 | |
|     DefaultLangProb(scoringcontext->ulscript);
 | |
|   ++linear_i;
 | |
| 
 | |
|   while ((base_i < base_limit) || (delta_i < delta_limit) ||
 | |
|          (distinct_i < distinct_limit)) {
 | |
|     int base_off = hitbuffer->base[base_i].offset;
 | |
|     int delta_off = hitbuffer->delta[delta_i].offset;
 | |
|     int distinct_off = hitbuffer->distinct[distinct_i].offset;
 | |
| 
 | |
|     // Do delta and distinct first, so that they are not lost at base_limit
 | |
|     if ((delta_i < delta_limit) &&
 | |
|         (delta_off <= base_off) && (delta_off <= distinct_off)) {
 | |
|       // Add delta entry
 | |
|       int indirect = hitbuffer->delta[delta_i].indirect;
 | |
|       ++delta_i;
 | |
|       uint32 langprob = delta_obj->kCLDTableInd[indirect];
 | |
|       if (langprob > 0) {
 | |
|         hitbuffer->linear[linear_i].offset = delta_off;
 | |
|         hitbuffer->linear[linear_i].type = DELTAHIT;
 | |
|         hitbuffer->linear[linear_i].langprob = langprob;
 | |
|         ++linear_i;
 | |
|       }
 | |
|     }
 | |
|     else if ((distinct_i < distinct_limit) &&
 | |
|              (distinct_off <= base_off) && (distinct_off <= delta_off)) {
 | |
|       // Add distinct entry
 | |
|       int indirect = hitbuffer->distinct[distinct_i].indirect;
 | |
|       ++distinct_i;
 | |
|       uint32 langprob = distinct_obj->kCLDTableInd[indirect];
 | |
|       if (langprob > 0) {
 | |
|         hitbuffer->linear[linear_i].offset = distinct_off;
 | |
|         hitbuffer->linear[linear_i].type = DISTINCTHIT;
 | |
|         hitbuffer->linear[linear_i].langprob = langprob;
 | |
|         ++linear_i;
 | |
|       }
 | |
|     }
 | |
|     else {
 | |
|       // Add one or two base entries
 | |
|       int indirect = hitbuffer->base[base_i].indirect;
 | |
|       // First, get right scoring table
 | |
|       const CLD2TableSummary* local_base_obj = base_obj;
 | |
|       if ((indirect & 0x80000000u) != 0) {
 | |
|         local_base_obj = base_obj2;
 | |
|         indirect &= ~0x80000000u;
 | |
|       }
 | |
|       ++base_i;
 | |
|       // One langprob in kQuadInd[0..SingleSize),
 | |
|       // two in kQuadInd[SingleSize..Size)
 | |
|       if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
 | |
|         // Up to three languages at indirect
 | |
|         uint32 langprob = local_base_obj->kCLDTableInd[indirect];
 | |
|         if (langprob > 0) {
 | |
|           hitbuffer->linear[linear_i].offset = base_off;
 | |
|           hitbuffer->linear[linear_i].type = base_hit;
 | |
|           hitbuffer->linear[linear_i].langprob = langprob;
 | |
|           ++linear_i;
 | |
|         }
 | |
|       } else {
 | |
|         // Up to six languages at start + 2 * (indirect - start)
 | |
|         indirect += (indirect - local_base_obj->kCLDTableSizeOne);
 | |
|         uint32 langprob = local_base_obj->kCLDTableInd[indirect];
 | |
|         uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
 | |
|         if (langprob > 0) {
 | |
|           hitbuffer->linear[linear_i].offset = base_off;
 | |
|           hitbuffer->linear[linear_i].type = base_hit;
 | |
|           hitbuffer->linear[linear_i].langprob = langprob;
 | |
|           ++linear_i;
 | |
|         }
 | |
|         if (langprob2 > 0) {
 | |
|           hitbuffer->linear[linear_i].offset = base_off;
 | |
|           hitbuffer->linear[linear_i].type = base_hit;
 | |
|           hitbuffer->linear[linear_i].langprob = langprob2;
 | |
|           ++linear_i;
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // Update
 | |
|   hitbuffer->next_linear = linear_i;
 | |
| 
 | |
|   // Add a dummy entry off the end, just to capture final offset
 | |
|   hitbuffer->linear[linear_i].offset =
 | |
|   hitbuffer->base[hitbuffer->next_base].offset;
 | |
|   hitbuffer->linear[linear_i].langprob = 0;
 | |
| }
 | |
| 
 | |
| // Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
 | |
| void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
 | |
|   int chunksize;
 | |
|   uint16 base_hit;
 | |
|   if (score_cjk) {
 | |
|     chunksize = kChunksizeUnis;
 | |
|     base_hit = UNIHIT;
 | |
|   } else {
 | |
|     chunksize = kChunksizeQuads;
 | |
|     base_hit = QUADHIT;
 | |
|   }
 | |
| 
 | |
|   int linear_i = 0;
 | |
|   int linear_off_end = hitbuffer->next_linear;
 | |
|   int text_i = letter_offset;               // Next unseen text offset
 | |
|   int next_chunk_start = 0;
 | |
|   int bases_left = hitbuffer->next_base;
 | |
|   while (bases_left > 0) {
 | |
|     // Linearize one chunk
 | |
|     int base_len = chunksize;     // Default; may be changed below
 | |
|     if (bases_left < (chunksize + (chunksize >> 1))) {
 | |
|       // If within 1.5 chunks of the end, avoid runts by using it all
 | |
|       base_len = bases_left;
 | |
|     } else if (bases_left < (2 * chunksize)) {
 | |
|       // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
 | |
|       base_len = (bases_left + 1) >> 1;
 | |
|     }
 | |
| 
 | |
|     hitbuffer->chunk_start[next_chunk_start] = linear_i;
 | |
|     hitbuffer->chunk_offset[next_chunk_start] = text_i;
 | |
|     ++next_chunk_start;
 | |
| 
 | |
|     int base_count = 0;
 | |
|     while ((base_count < base_len) && (linear_i < linear_off_end)) {
 | |
|       if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
 | |
|       ++linear_i;
 | |
|     }
 | |
|     text_i = hitbuffer->linear[linear_i].offset;    // Next unseen text offset
 | |
|     bases_left -= base_len;
 | |
|   }
 | |
| 
 | |
|   // If no base hits at all, make a single dummy chunk
 | |
|   if (next_chunk_start == 0) {
 | |
|      hitbuffer->chunk_start[next_chunk_start] = 0;
 | |
|      hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
 | |
|      ++next_chunk_start;
 | |
|   }
 | |
| 
 | |
|   // Remember the linear array start of dummy entry
 | |
|   hitbuffer->next_chunk_start = next_chunk_start;
 | |
| 
 | |
|   // Add a dummy entry off the end, just to capture final linear subscr
 | |
|   hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
 | |
|   hitbuffer->chunk_offset[next_chunk_start] = text_i;
 | |
| }
 | |
| 
 | |
| 
 | |
| // Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
 | |
| // break linear array into chunks.
 | |
| //
 | |
| // Input:
 | |
| //  hitbuffer base, delta, distinct arrays
 | |
| // Output:
 | |
| //  linear array
 | |
| //  chunk_start array
 | |
| //
 | |
| void LinearizeHitBuffer(int letter_offset,
 | |
|                         ScoringContext* scoringcontext,
 | |
|                         bool more_to_come, bool score_cjk,
 | |
|                         ScoringHitBuffer* hitbuffer) {
 | |
|   LinearizeAll(scoringcontext, score_cjk, hitbuffer);
 | |
|   ChunkAll(letter_offset, score_cjk, hitbuffer);
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| // The hitbuffer is in an awkward form -- three sets of base/delta/distinct
 | |
| // scores, each with an indirect subscript to one of six scoring tables, some
 | |
| // of which can yield two langprobs for six languages, others one langprob for
 | |
| // three languages. The only correlation between base/delta/distinct is their
 | |
| // offsets into the letters-only text buffer.
 | |
| //
 | |
| // SummaryBuffer needs to be built to linear, giving linear offset of start of
 | |
| // each chunk
 | |
| //
 | |
| // So we first do all the langprob lookups and merge-sort by offset to make
 | |
| // a single linear vector, building a side vector of chunk beginnings as we go.
 | |
| // The sharpening is simply moving the beginnings, scoring is a simple linear
 | |
| // sweep, etc.
 | |
| 
 | |
| void ProcessHitBuffer(const LangSpan& scriptspan,
 | |
|                       int letter_offset,
 | |
|                       ScoringContext* scoringcontext,
 | |
|                       DocTote* doc_tote,
 | |
|                       ResultChunkVector* vec,
 | |
|                       bool more_to_come, bool score_cjk,
 | |
|                       ScoringHitBuffer* hitbuffer) {
 | |
|   if (scoringcontext->flags_cld2_verbose) {
 | |
|     fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
 | |
|     DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
 | |
|   }
 | |
| 
 | |
|   LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
 | |
|                      hitbuffer);
 | |
| 
 | |
|   if (scoringcontext->flags_cld2_verbose) {
 | |
|     fprintf(scoringcontext->debug_file, "Linear[) ");
 | |
|     DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
 | |
|   }
 | |
| 
 | |
|   SummaryBuffer summarybuffer;
 | |
|   summarybuffer.n = 0;
 | |
|   ChunkSpan last_cspan;
 | |
|   ScoreAllHits(scriptspan.text, scriptspan.ulscript,
 | |
|                     more_to_come, score_cjk, hitbuffer,
 | |
|                     scoringcontext,
 | |
|                     &summarybuffer, &last_cspan);
 | |
| 
 | |
|   if (scoringcontext->flags_cld2_verbose) {
 | |
|     DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
 | |
|   }
 | |
| 
 | |
|   if (vec != NULL) {
 | |
|     // Sharpen boundaries of summarybuffer
 | |
|     // This is not a high-performance path
 | |
|     SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
 | |
|                       &summarybuffer);
 | |
|     // Show after the sharpening
 | |
|     // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
 | |
|     //             hitbuffer, scoringcontext, &summarybuffer);
 | |
| 
 | |
|     if (scoringcontext->flags_cld2_verbose) {
 | |
|       DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
 | |
|   SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
 | |
|                         &summarybuffer, more_to_come, vec);
 | |
| }
 | |
| 
 | |
| void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
 | |
|   // Splice hitbuffer and summarybuffer for next round. With big chunks and
 | |
|   // distinctive-word state carried across chunks, we might not need to do this.
 | |
|   hitbuffer->next_base = 0;
 | |
|   hitbuffer->next_delta = 0;
 | |
|   hitbuffer->next_distinct = 0;
 | |
|   hitbuffer->next_linear = 0;
 | |
|   hitbuffer->next_chunk_start = 0;
 | |
|   hitbuffer->lowest_offset = next_offset;
 | |
| }
 | |
| 
 | |
| 
 | |
| // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
 | |
| // scoringcontext
 | |
| void ScoreEntireScriptSpan(const LangSpan& scriptspan,
 | |
|                            ScoringContext* scoringcontext,
 | |
|                            DocTote* doc_tote,
 | |
|                            ResultChunkVector* vec) {
 | |
|   int bytes = scriptspan.text_bytes;
 | |
|   // Artificially set score to 1024 per 1KB, or 1 per byte
 | |
|   int score = bytes;
 | |
|   int reliability = 100;
 | |
|   // doc_tote uses full languages
 | |
|   Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
 | |
|   doc_tote->Add(one_one_lang, bytes, score, reliability);
 | |
| 
 | |
|   if (scoringcontext->flags_cld2_html) {
 | |
|     ChunkSummary chunksummary = {
 | |
|       1, 0,
 | |
|       one_one_lang, UNKNOWN_LANGUAGE, score, 1,
 | |
|       bytes, 0, scriptspan.ulscript, reliability, reliability
 | |
|     };
 | |
|     CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
 | |
|                false, false, NULL,
 | |
|                scoringcontext, NULL, &chunksummary);
 | |
|   }
 | |
| 
 | |
|   // First byte is always a space
 | |
|   JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
 | |
|                       one_one_lang, 1, bytes - 1, vec);
 | |
| 
 | |
|   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
 | |
| }
 | |
| 
 | |
| // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
 | |
| void ScoreCJKScriptSpan(const LangSpan& scriptspan,
 | |
|                         ScoringContext* scoringcontext,
 | |
|                         DocTote* doc_tote,
 | |
|                         ResultChunkVector* vec) {
 | |
|   // Allocate three parallel arrays of scoring hits
 | |
|   ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
 | |
|   hitbuffer->init();
 | |
|   hitbuffer->ulscript = scriptspan.ulscript;
 | |
| 
 | |
|   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
 | |
|   scoringcontext->oldest_distinct_boost = 0;
 | |
| 
 | |
|   // Incoming scriptspan has a single leading space at scriptspan.text[0]
 | |
|   // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
 | |
| 
 | |
|   int letter_offset = 1;        // Skip initial space
 | |
|   hitbuffer->lowest_offset = letter_offset;
 | |
|   int letter_limit = scriptspan.text_bytes;
 | |
|   while (letter_offset < letter_limit) {
 | |
|     if (scoringcontext->flags_cld2_verbose) {
 | |
|       fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n",
 | |
|               letter_offset, letter_limit);
 | |
|     }
 | |
|     //
 | |
|     // Fill up one hitbuffer, possibly splicing onto previous fragment
 | |
|     //
 | |
|     // NOTE: GetUniHits deals with close repeats
 | |
|     // NOTE: After last chunk there is always a hitbuffer entry with an offset
 | |
|     // just off the end of the text = next_offset.
 | |
|     int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
 | |
|                                   scoringcontext, hitbuffer);
 | |
|     // NOTE: GetBiHitVectors deals with close repeats,
 | |
|     // does one hash and two lookups (delta and distinct) per word
 | |
|     GetBiHits(scriptspan.text, letter_offset, next_offset,
 | |
|                 scoringcontext, hitbuffer);
 | |
| 
 | |
|     //
 | |
|     // Score one hitbuffer in chunks to summarybuffer
 | |
|     //
 | |
|     bool more_to_come = next_offset < letter_limit;
 | |
|     bool score_cjk = true;
 | |
|     ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
 | |
|                      more_to_come, score_cjk, hitbuffer);
 | |
|     SpliceHitBuffer(hitbuffer, next_offset);
 | |
| 
 | |
|     letter_offset = next_offset;
 | |
|   }
 | |
| 
 | |
|   delete hitbuffer;
 | |
|   // Context across buffers is not connected yet
 | |
|   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
 | |
| // We have a scriptspan with all lowercase text in one script. Look up
 | |
| // quadgrams and octagrams, saving the hits in three parallel vectors.
 | |
| // Score from those vectors in chunks, toting each chunk to get a single
 | |
| // language, and combining into the overall document score. The hit vectors
 | |
| // in general are not big enough to handle and entire scriptspan, so
 | |
| // repeat until the entire scriptspan is scored.
 | |
| // Caller deals with minimizing numbr of runt scriptspans
 | |
| // This routine deals with minimizing number of runt chunks.
 | |
| //
 | |
| // Returns updated scoringcontext
 | |
| // Returns updated doc_tote
 | |
| // If vec != NULL, appends to that vector of ResultChunk's
 | |
| void ScoreQuadScriptSpan(const LangSpan& scriptspan,
 | |
|                          ScoringContext* scoringcontext,
 | |
|                          DocTote* doc_tote,
 | |
|                          ResultChunkVector* vec) {
 | |
|   // Allocate three parallel arrays of scoring hits
 | |
|   ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
 | |
|   hitbuffer->init();
 | |
|   hitbuffer->ulscript = scriptspan.ulscript;
 | |
| 
 | |
|   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
 | |
|   scoringcontext->oldest_distinct_boost = 0;
 | |
| 
 | |
|   // Incoming scriptspan has a single leading space at scriptspan.text[0]
 | |
|   // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
 | |
| 
 | |
|   int letter_offset = 1;        // Skip initial space
 | |
|   hitbuffer->lowest_offset = letter_offset;
 | |
|   int letter_limit = scriptspan.text_bytes;
 | |
|   while (letter_offset < letter_limit) {
 | |
|     //
 | |
|     // Fill up one hitbuffer, possibly splicing onto previous fragment
 | |
|     //
 | |
|     // NOTE: GetQuadHits deals with close repeats
 | |
|     // NOTE: After last chunk there is always a hitbuffer entry with an offset
 | |
|     // just off the end of the text = next_offset.
 | |
|     int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
 | |
|                                   scoringcontext, hitbuffer);
 | |
|     // If true, there is more text to process in this scriptspan
 | |
|     // NOTE: GetOctaHitVectors deals with close repeats,
 | |
|     // does one hash and two lookups (delta and distinct) per word
 | |
|     GetOctaHits(scriptspan.text, letter_offset, next_offset,
 | |
|                 scoringcontext, hitbuffer);
 | |
| 
 | |
|     //
 | |
|     // Score one hitbuffer in chunks to summarybuffer
 | |
|     //
 | |
|     bool more_to_come = next_offset < letter_limit;
 | |
|     bool score_cjk = false;
 | |
|     ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
 | |
|                      more_to_come, score_cjk, hitbuffer);
 | |
|     SpliceHitBuffer(hitbuffer, next_offset);
 | |
| 
 | |
|     letter_offset = next_offset;
 | |
|   }
 | |
| 
 | |
|   delete hitbuffer;
 | |
| }
 | |
| 
 | |
| 
 | |
| // Score one scriptspan into doc_tote and vec, updating scoringcontext
 | |
| // Inputs:
 | |
| //  One scriptspan of perhaps 40-60KB, all same script lower-case letters
 | |
| //    and single ASCII spaces. First character is a space to allow simple
 | |
| //    begining-of-word detect. End of buffer has three spaces and NUL to
 | |
| //    allow easy scan-to-end-of-word.
 | |
| //  Scoring context of
 | |
| //    scoring tables
 | |
| //    flags
 | |
| //    running boosts
 | |
| // Outputs:
 | |
| //  Updated doc_tote giving overall languages and byte counts
 | |
| //  Optional updated chunk vector giving offset, length, language
 | |
| //
 | |
| // Caller initializes flags, boosts, doc_tote and vec.
 | |
| // Caller aggregates across multiple scriptspans
 | |
| // Caller calculates final document result
 | |
| // Caller deals with detecting and triggering suppression of repeated text.
 | |
| //
 | |
| // This top-level routine just chooses the recognition type and calls one of
 | |
| // the next-level-down routines.
 | |
| //
 | |
| void ScoreOneScriptSpan(const LangSpan& scriptspan,
 | |
|                         ScoringContext* scoringcontext,
 | |
|                         DocTote* doc_tote,
 | |
|                         ResultChunkVector* vec) {
 | |
|   if (scoringcontext->flags_cld2_verbose) {
 | |
|     fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
 | |
|             ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
 | |
|     // Optionally print the chunk lowercase letters/marks text
 | |
|     string temp(&scriptspan.text[0], scriptspan.text_bytes);
 | |
|     fprintf(scoringcontext->debug_file, "'%s'",
 | |
|             GetHtmlEscapedText(temp).c_str());
 | |
|     fprintf(scoringcontext->debug_file, "<br>\n");
 | |
|   }
 | |
|   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
 | |
|   scoringcontext->oldest_distinct_boost = 0;
 | |
|   ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
 | |
|   if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
 | |
|     rtype = RTypeMany;
 | |
|   }
 | |
|   switch (rtype) {
 | |
|   case RTypeNone:
 | |
|   case RTypeOne:
 | |
|     ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
 | |
|     break;
 | |
|   case RTypeCJK:
 | |
|     ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
 | |
|     break;
 | |
|   case RTypeMany:
 | |
|     ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
 | |
|     break;
 | |
|   }
 | |
| }
 | |
| 
 | |
| }       // End namespace CLD2
 | |
| 
 | 
