forked from mirrors/gecko-dev
		
	
		
			
				
	
	
		
			322 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			322 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| // Copyright 2013 Google Inc. All Rights Reserved.
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| //     http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| //
 | |
| // Author: dsites@google.com (Dick Sites)
 | |
| //
 | |
| 
 | |
| #include <stdio.h>
 | |
| #include <stdlib.h>
 | |
| 
 | |
| #include "../public/compact_lang_det.h"
 | |
| #include "../public/encodings.h"
 | |
| #include "compact_lang_det_impl.h"
 | |
| #include "integral_types.h"
 | |
| #include "lang_script.h"
 | |
| 
 | |
| namespace CLD2 {
 | |
| 
 | |
| // String is "code_version - data_scrape_date"
 | |
| //static const char* kDetectLanguageVersion = "V2.0 - 20130715";
 | |
| 
 | |
| 
 | |
| // Large-table version for all ~160 languages
 | |
| // Small-table version for all ~60 languages
 | |
| 
 | |
| // Scan interchange-valid UTF-8 bytes and detect most likely language
 | |
| Language DetectLanguage(
 | |
|                           const char* buffer,
 | |
|                           int buffer_length,
 | |
|                           bool is_plain_text,
 | |
|                           bool* is_reliable) {
 | |
|   bool allow_extended_lang = false;
 | |
|   Language language3[3];
 | |
|   int percent3[3];
 | |
|   double normalized_score3[3];
 | |
|   int text_bytes;
 | |
|   int flags = 0;
 | |
|   Language plus_one = UNKNOWN_LANGUAGE;
 | |
|   const char* tld_hint = "";
 | |
|   int encoding_hint = UNKNOWN_ENCODING;
 | |
|   Language language_hint = UNKNOWN_LANGUAGE;
 | |
|   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 | |
| 
 | |
|   Language lang = DetectLanguageSummaryV2(
 | |
|                           buffer,
 | |
|                           buffer_length,
 | |
|                           is_plain_text,
 | |
|                           &cldhints,
 | |
|                           allow_extended_lang,
 | |
|                           flags,
 | |
|                           plus_one,
 | |
|                           language3,
 | |
|                           percent3,
 | |
|                           normalized_score3,
 | |
|                           NULL,
 | |
|                           &text_bytes,
 | |
|                           is_reliable);
 | |
|   // Default to English
 | |
|   if (lang == UNKNOWN_LANGUAGE) {
 | |
|     lang = ENGLISH;
 | |
|   }
 | |
|   return lang;
 | |
| }
 | |
| 
 | |
| // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
 | |
| Language DetectLanguageSummary(
 | |
|                           const char* buffer,
 | |
|                           int buffer_length,
 | |
|                           bool is_plain_text,
 | |
|                           Language* language3,
 | |
|                           int* percent3,
 | |
|                           int* text_bytes,
 | |
|                           bool* is_reliable) {
 | |
|   double normalized_score3[3];
 | |
|   bool allow_extended_lang = false;
 | |
|   int flags = 0;
 | |
|   Language plus_one = UNKNOWN_LANGUAGE;
 | |
|   const char* tld_hint = "";
 | |
|   int encoding_hint = UNKNOWN_ENCODING;
 | |
|   Language language_hint = UNKNOWN_LANGUAGE;
 | |
|   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 | |
| 
 | |
|   Language lang = DetectLanguageSummaryV2(
 | |
|                           buffer,
 | |
|                           buffer_length,
 | |
|                           is_plain_text,
 | |
|                           &cldhints,
 | |
|                           allow_extended_lang,
 | |
|                           flags,
 | |
|                           plus_one,
 | |
|                           language3,
 | |
|                           percent3,
 | |
|                           normalized_score3,
 | |
|                           NULL,
 | |
|                           text_bytes,
 | |
|                           is_reliable);
 | |
|   // Default to English
 | |
|   if (lang == UNKNOWN_LANGUAGE) {
 | |
|     lang = ENGLISH;
 | |
|   }
 | |
|   return lang;
 | |
| }
 | |
| 
 | |
| // Same as above, with hints supplied
 | |
| // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
 | |
| Language DetectLanguageSummary(
 | |
|                           const char* buffer,
 | |
|                           int buffer_length,
 | |
|                           bool is_plain_text,
 | |
|                           const char* tld_hint,       // "id" boosts Indonesian
 | |
|                           int encoding_hint,          // SJS boosts Japanese
 | |
|                           Language language_hint,     // ITALIAN boosts it
 | |
|                           Language* language3,
 | |
|                           int* percent3,
 | |
|                           int* text_bytes,
 | |
|                           bool* is_reliable) {
 | |
|   double normalized_score3[3];
 | |
|   bool allow_extended_lang = false;
 | |
|   int flags = 0;
 | |
|   Language plus_one = UNKNOWN_LANGUAGE;
 | |
|   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 | |
| 
 | |
|   Language lang = DetectLanguageSummaryV2(
 | |
|                           buffer,
 | |
|                           buffer_length,
 | |
|                           is_plain_text,
 | |
|                           &cldhints,
 | |
|                           allow_extended_lang,
 | |
|                           flags,
 | |
|                           plus_one,
 | |
|                           language3,
 | |
|                           percent3,
 | |
|                           normalized_score3,
 | |
|                           NULL,
 | |
|                           text_bytes,
 | |
|                           is_reliable);
 | |
|   // Default to English
 | |
|   if (lang == UNKNOWN_LANGUAGE) {
 | |
|     lang = ENGLISH;
 | |
|   }
 | |
|   return lang;
 | |
| }
 | |
| 
 | |
| 
 | |
| // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
 | |
| // languages.
 | |
| // Extended languages are additional Google interface languages and Unicode
 | |
| // single-language scripts, from ext_lang_enc.h
 | |
| Language ExtDetectLanguageSummary(
 | |
|                           const char* buffer,
 | |
|                           int buffer_length,
 | |
|                           bool is_plain_text,
 | |
|                           Language* language3,
 | |
|                           int* percent3,
 | |
|                           int* text_bytes,
 | |
|                           bool* is_reliable) {
 | |
|   double normalized_score3[3];
 | |
|   bool allow_extended_lang = true;
 | |
|   int flags = 0;
 | |
|   Language plus_one = UNKNOWN_LANGUAGE;
 | |
|   const char* tld_hint = "";
 | |
|   int encoding_hint = UNKNOWN_ENCODING;
 | |
|   Language language_hint = UNKNOWN_LANGUAGE;
 | |
|   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 | |
| 
 | |
|   Language lang = DetectLanguageSummaryV2(
 | |
|                           buffer,
 | |
|                           buffer_length,
 | |
|                           is_plain_text,
 | |
|                           &cldhints,
 | |
|                           allow_extended_lang,
 | |
|                           flags,
 | |
|                           plus_one,
 | |
|                           language3,
 | |
|                           percent3,
 | |
|                           normalized_score3,
 | |
|                           NULL,
 | |
|                           text_bytes,
 | |
|                           is_reliable);
 | |
|   // Do not default to English
 | |
|   return lang;
 | |
| }
 | |
| 
 | |
| // Same as above, with hints supplied
 | |
| // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
 | |
| // languages.
 | |
| // Extended languages are additional Google interface languages and Unicode
 | |
| // single-language scripts, from ext_lang_enc.h
 | |
| Language ExtDetectLanguageSummary(
 | |
|                           const char* buffer,
 | |
|                           int buffer_length,
 | |
|                           bool is_plain_text,
 | |
|                           const char* tld_hint,       // "id" boosts Indonesian
 | |
|                           int encoding_hint,          // SJS boosts Japanese
 | |
|                           Language language_hint,     // ITALIAN boosts it
 | |
|                           Language* language3,
 | |
|                           int* percent3,
 | |
|                           int* text_bytes,
 | |
|                           bool* is_reliable) {
 | |
|   double normalized_score3[3];
 | |
|   bool allow_extended_lang = true;
 | |
|   int flags = 0;
 | |
|   Language plus_one = UNKNOWN_LANGUAGE;
 | |
|   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 | |
| 
 | |
|   Language lang = DetectLanguageSummaryV2(
 | |
|                           buffer,
 | |
|                           buffer_length,
 | |
|                           is_plain_text,
 | |
|                           &cldhints,
 | |
|                           allow_extended_lang,
 | |
|                           flags,
 | |
|                           plus_one,
 | |
|                           language3,
 | |
|                           percent3,
 | |
|                           normalized_score3,
 | |
|                           NULL,
 | |
|                           text_bytes,
 | |
|                           is_reliable);
 | |
|   // Do not default to English
 | |
|   return lang;
 | |
| }
 | |
| 
 | |
| // Same as above, and also returns internal language scores as a ratio to
 | |
| // normal score for real text in that language. Scores close to 1.0 indicate
 | |
| // normal text, while scores far away from 1.0 indicate badly-skewed text or
 | |
| // gibberish
 | |
| //
 | |
| Language ExtDetectLanguageSummary(
 | |
|                         const char* buffer,
 | |
|                         int buffer_length,
 | |
|                         bool is_plain_text,
 | |
|                         const char* tld_hint,       // "id" boosts Indonesian
 | |
|                         int encoding_hint,          // SJS boosts Japanese
 | |
|                         Language language_hint,     // ITALIAN boosts it
 | |
|                         Language* language3,
 | |
|                         int* percent3,
 | |
|                         double* normalized_score3,
 | |
|                         int* text_bytes,
 | |
|                         bool* is_reliable) {
 | |
|   bool allow_extended_lang = true;
 | |
|   int flags = 0;
 | |
|   Language plus_one = UNKNOWN_LANGUAGE;
 | |
|   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 | |
| 
 | |
|   Language lang = DetectLanguageSummaryV2(
 | |
|                           buffer,
 | |
|                           buffer_length,
 | |
|                           is_plain_text,
 | |
|                           &cldhints,
 | |
|                           allow_extended_lang,
 | |
|                           flags,
 | |
|                           plus_one,
 | |
|                           language3,
 | |
|                           percent3,
 | |
|                           normalized_score3,
 | |
|                           NULL,
 | |
|                           text_bytes,
 | |
|                           is_reliable);
 | |
|   // Do not default to English
 | |
|   return lang;
 | |
| }
 | |
| 
 | |
| // Use this one.
 | |
| // Hints are collected into a struct.
 | |
| // Flags are passed in (normally zero).
 | |
| //
 | |
| // Also returns 3 internal language scores as a ratio to
 | |
| // normal score for real text in that language. Scores close to 1.0 indicate
 | |
| // normal text, while scores far away from 1.0 indicate badly-skewed text or
 | |
| // gibberish
 | |
| //
 | |
| // Returns a vector of chunks in different languages, so that caller may
 | |
| // spell-check, translate, or otherwaise process different parts of the input
 | |
| // buffer in language-dependant ways.
 | |
| //
 | |
| Language ExtDetectLanguageSummary(
 | |
|                         const char* buffer,
 | |
|                         int buffer_length,
 | |
|                         bool is_plain_text,
 | |
|                         const CLDHints* cld_hints,
 | |
|                         int flags,
 | |
|                         Language* language3,
 | |
|                         int* percent3,
 | |
|                         double* normalized_score3,
 | |
|                         ResultChunkVector* resultchunkvector,
 | |
|                         int* text_bytes,
 | |
|                         bool* is_reliable) {
 | |
|   bool allow_extended_lang = true;
 | |
|   Language plus_one = UNKNOWN_LANGUAGE;
 | |
| 
 | |
|   Language lang = DetectLanguageSummaryV2(
 | |
|                           buffer,
 | |
|                           buffer_length,
 | |
|                           is_plain_text,
 | |
|                           cld_hints,
 | |
|                           allow_extended_lang,
 | |
|                           flags,
 | |
|                           plus_one,
 | |
|                           language3,
 | |
|                           percent3,
 | |
|                           normalized_score3,
 | |
|                           resultchunkvector,
 | |
|                           text_bytes,
 | |
|                           is_reliable);
 | |
|   // Do not default to English
 | |
|   return lang;
 | |
| }
 | |
| 
 | |
| }       // End namespace CLD2
 | |
| 
 | 
