forked from mirrors/gecko-dev
		
	
		
			
				
	
	
		
			1086 lines
		
	
	
	
		
			37 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			1086 lines
		
	
	
	
		
			37 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| // Copyright 2013 Google Inc. All Rights Reserved.
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| //     http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| //
 | |
| // Author: dsites@google.com (Dick Sites)
 | |
| //
 | |
| 
 | |
| 
 | |
| #include "getonescriptspan.h"
 | |
| #include <string.h>
 | |
| 
 | |
| #include "fixunicodevalue.h"
 | |
| #include "lang_script.h"
 | |
| #include "port.h"
 | |
| #include "utf8statetable.h"
 | |
| 
 | |
| #include "utf8prop_lettermarkscriptnum.h"
 | |
| #include "utf8repl_lettermarklower.h"
 | |
| #include "utf8scannot_lettermarkspecial.h"
 | |
| 
 | |
| 
 | |
| namespace CLD2 {
 | |
| 
 | |
| // Alphabetical order for binary search, from
 | |
| // generated_entities.cc
 | |
| extern const int kNameToEntitySize;
 | |
| extern const CharIntPair kNameToEntity[];
 | |
| 
 | |
| static const int kMaxUpToWordBoundary = 50;       // spans shorter than this are
 | |
|                                                   // made longer, else made shorter
 | |
| static const int kMaxAdvanceToWordBoundary = 10;  // move at most this many bytes
 | |
|                                                   // to round to a word boundary,
 | |
|                                                   // in the direction chosen above
 | |
| 
 | |
| static const char kSpecialSymbol[256] = {       // byte-indexed; 1 only at '&' (0x26), '<' (0x3C), '>' (0x3E)
 | |
|   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
 | |
|   0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
 | |
|   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
 | |
|   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
 | |
| 
 | |
|   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
 | |
|   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
 | |
|   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
 | |
|   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
 | |
| };
 | |
| 
 | |
| 
 | |
| 
 | |
| #define LT 0      // <
 | |
| #define GT 1      // >
 | |
| #define EX 2      // !
 | |
| #define HY 3      // -
 | |
| #define QU 4      // "
 | |
| #define AP 5      // '
 | |
| #define SL 6      // /
 | |
| #define S_ 7      // S or s
 | |
| #define C_ 8      // C or c
 | |
| #define R_ 9      // R or r
 | |
| #define I_ 10     // I or i
 | |
| #define P_ 11     // P or p
 | |
| #define T_ 12     // T or t
 | |
| #define Y_ 13     // Y or y
 | |
| #define L_ 14     // L or l
 | |
| #define E_ 15     // E or e
 | |
| #define CR 16     // <cr> or <lf>
 | |
| #define NL 17     // non-letter: ASCII whitespace, digit, punctuation
 | |
| #define PL 18     // possible letter, incl. &
 | |
| #define xx 19     // <unused>
 | |
| 
 | |
| // Map each byte value to one of the 20 categories above; the category is the column index into a kTagParseTbl_0 row
 | |
| static const uint8 kCharToSub[256] = {
 | |
|   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
 | |
|   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
 | |
|   NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
 | |
|   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
 | |
| 
 | |
|   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
 | |
|   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
 | |
|   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
 | |
|   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
 | |
| 
 | |
|   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
 | |
|   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
 | |
|   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
 | |
|   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
 | |
| 
 | |
|   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
 | |
|   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
 | |
|   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
 | |
|   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
 | |
| };
 | |
| 
 | |
| #undef LT
 | |
| #undef GT
 | |
| #undef EX
 | |
| #undef HY
 | |
| #undef QU
 | |
| #undef AP
 | |
| #undef SL
 | |
| #undef S_
 | |
| #undef C_
 | |
| #undef R_
 | |
| #undef I_
 | |
| #undef P_
 | |
| #undef T_
 | |
| #undef Y_
 | |
| #undef L_
 | |
| #undef E_
 | |
| #undef CR
 | |
| #undef NL
 | |
| #undef PL
 | |
| #undef xx
 | |
| 
 | |
| 
 | |
| #define OK 0      // next state 0: normal exit
 | |
| #define X_ 1      // next state 1: error exit
 | |
| 
 | |
| 
 | |
| static const int kMaxExitStateLettersMarksOnly = 1;  // scan stops on reaching state 0 or 1
 | |
| static const int kMaxExitStateAllText = 2;           // scan also stops on reaching state 2
 | |
| 
 | |
| 
 | |
| // State machine to do cheap parse of non-letter strings incl. tags
 | |
| // advances <tag>
 | |
| //          |    |
 | |
| // advances <tag> ... </tag>  for <script> <style>
 | |
| //          |               |
 | |
| // advances <!-- ... <tag> ... -->
 | |
| //          |                     |
 | |
| // advances <tag
 | |
| //          ||  (0)
 | |
| // advances <tag <tag2>
 | |
| //          ||  (0)
 | |
| // Each row holds 20 next-state entries, one per kCharToSub category; row for state s starts at index s * 20.
 | |
| // We start in state [0] at a non-letter and make at least one transition
 | |
| // When scanning for just letters, arriving back at state [0] or [1] exits
 | |
| //   the state machine.
 | |
| // When scanning for any non-tag text, arriving at state [2] also exits
 | |
| static const uint8 kTagParseTbl_0[] = {
 | |
| // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
 | |
|    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK    exit state
 | |
|   X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
 | |
|    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*   [exit state]
 | |
|   X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
 | |
|   X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
 | |
|   X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
 | |
|    6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
 | |
|    6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
 | |
|    6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
 | |
|   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
 | |
|   10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
 | |
|   11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
 | |
|   X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
 | |
| 
 | |
| // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
 | |
|   X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
 | |
|   X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
 | |
|   X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
 | |
|   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
 | |
|   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
 | |
|   X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
 | |
|   20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
 | |
|   19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
 | |
|   19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
 | |
|   19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
 | |
|   19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
 | |
|   19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
 | |
|   19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
 | |
|   19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
 | |
|   19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
 | |
| 
 | |
| // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
 | |
|   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
 | |
|   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
 | |
|   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
 | |
|   X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
 | |
|   33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
 | |
|   32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
 | |
|   32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
 | |
|   32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
 | |
|   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
 | |
|   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
 | |
|   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
 | |
|   32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
 | |
| };
 | |
| 
 | |
| #undef OK
 | |
| #undef X_
 | |
| 
 | |
| enum
 | |
| {
 | |
|   UTFmax        = 4,            // maximum bytes per rune (runetochar writes at most this many)
 | |
|   Runesync      = 0x80,         // cannot represent part of a UTF sequence (<)
 | |
|   Runeself      = 0x80,         // rune and UTF sequences are the same (<)
 | |
|   Runeerror     = 0xFFFD,       // decoding error in UTF; runetochar substitutes it for values above Runemax
 | |
|   Runemax       = 0x10FFFF,     // maximum rune value
 | |
| };
 | |
| 
 | |
| // Debugging. Not thread safe: DisplayPiece() fills and returns this one shared buffer.
 | |
| static char gDisplayPiece[32];
 | |
| const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};  // UTF-8 char length, indexed by high 4 bits of the lead byte
 | |
| char* DisplayPiece(const char* next_byte_, int byte_length_) {
 | |
|   // Copy up to 8 UTF-8 chars to buffer
 | |
|   int k = 0;    // byte count
 | |
|   int n = 0;    // character count
 | |
|   for (int i = 0; i < byte_length_; ++i) {
 | |
|     char c = next_byte_[i];
 | |
|     if ((c & 0xc0) != 0x80) {
 | |
|       // Beginning of a UTF-8 character
 | |
|       int charlen = gCharlen[static_cast<uint8>(c) >> 4];
 | |
|       if (i + charlen > byte_length_) {break;} // Not enough room for full char
 | |
|       if (k >= (32 - 7)) {break;}   // Not necessarily enough room
 | |
|       if (n >= 8) {break;}          // Enough characters already
 | |
|       ++n;
 | |
|     }
 | |
|     if (c == '<') {
 | |
|       memcpy(&gDisplayPiece[k], "<", 4); k += 4;
 | |
|     } else if (c == '>') {
 | |
|       memcpy(&gDisplayPiece[k], ">", 4); k += 4;
 | |
|     } else if (c == '&') {
 | |
|       memcpy(&gDisplayPiece[k], "&", 5); k += 5;
 | |
|     } else if (c == '\'') {
 | |
|       memcpy(&gDisplayPiece[k], "'", 6); k += 6;
 | |
|     } else if (c == '"') {
 | |
|       memcpy(&gDisplayPiece[k], """, 6); k += 6;
 | |
|     } else {
 | |
|       gDisplayPiece[k++] = c;
 | |
|     }
 | |
|   }
 | |
|   gDisplayPiece[k++] = '\0';
 | |
|   return gDisplayPiece;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| // runetochar copies (encodes) one rune, pointed to by r, to at most
 | |
| // UTFmax bytes starting at s and returns the number of bytes generated.
 | |
| int runetochar(char *str, const char32 *rune) {
 | |
|   // Convert to unsigned for range check.
 | |
|   unsigned long c;
 | |
| 
 | |
|   // 1 char 00-7F
 | |
|   c = *rune;
 | |
|   if(c <= 0x7F) {
 | |
|     str[0] = c;
 | |
|     return 1;
 | |
|   }
 | |
| 
 | |
|   // 2 char 0080-07FF
 | |
|   if(c <= 0x07FF) {
 | |
|     str[0] = 0xC0 | (c >> 1*6);
 | |
|     str[1] = 0x80 | (c & 0x3F);
 | |
|     return 2;
 | |
|   }
 | |
| 
 | |
|   // Range check
 | |
|   if (c > Runemax) {
 | |
|     c = Runeerror;
 | |
|   }
 | |
| 
 | |
|   // 3 char 0800-FFFF
 | |
|   if (c <= 0xFFFF) {
 | |
|     str[0] = 0xE0 |  (c >> 2*6);
 | |
|     str[1] = 0x80 | ((c >> 1*6) & 0x3F);
 | |
|     str[2] = 0x80 |  (c & 0x3F);
 | |
|     return 3;
 | |
|   }
 | |
| 
 | |
|   // 4 char 10000-1FFFFF
 | |
|   str[0] = 0xF0 | (c >> 3*6);
 | |
|   str[1] = 0x80 | ((c >> 2*6) & 0x3F);
 | |
|   str[2] = 0x80 | ((c >> 1*6) & 0x3F);
 | |
|   str[3] = 0x80 | (c & 0x3F);
 | |
|   return 4;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| // Useful for converting an entity to an ascii value.
 | |
| // RETURNS unicode value, or -1 if entity isn't valid.  Don't include & or ;
 | |
| int LookupEntity(const char* entity_name, int entity_len) {
 | |
|   // Make a C string
 | |
|   if (entity_len >= 16) {return -1;}    // All real entities are shorter
 | |
|   char temp[16];
 | |
|   memcpy(temp, entity_name, entity_len);
 | |
|   temp[entity_len] = '\0';
 | |
|   int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
 | |
|   if (match >= 0) {return kNameToEntity[match].i;}
 | |
|   return -1;
 | |
| }
 | |
| 
 | |
// Returns true only for the ASCII decimal digits '0'..'9'.
bool ascii_isdigit(char c) {
  if (c < '0') {return false;}
  if (c > '9') {return false;}
  return true;
}
 | |
// Returns true only for ASCII hexadecimal digits: 0-9, a-f, A-F.
bool ascii_isxdigit(char c) {
  const bool is_decimal = (c >= '0') && (c <= '9');
  const bool is_lower_hex = (c >= 'a') && (c <= 'f');
  const bool is_upper_hex = (c >= 'A') && (c <= 'F');
  return is_decimal || is_lower_hex || is_upper_hex;
}
 | |
// Returns true only for ASCII letters and decimal digits: 0-9, a-z, A-Z.
bool ascii_isalnum(char c) {
  const bool is_digit = (c >= '0') && (c <= '9');
  const bool is_lower = (c >= 'a') && (c <= 'z');
  const bool is_upper = (c >= 'A') && (c <= 'Z');
  return is_digit || is_lower || is_upper;
}
 | |
// Returns the numeric value 0..15 of an ASCII hexadecimal digit.
// Any other character yields 0.
int hex_digit_to_int(char c) {
  int value = 0;
  if ((c >= '0') && (c <= '9')) {
    value = c - '0';
  } else if ((c >= 'a') && (c <= 'f')) {
    value = 10 + (c - 'a');
  } else if ((c >= 'A') && (c <= 'F')) {
    value = 10 + (c - 'A');
  }
  return value;
}
 | |
| 
 | |
| static int32 strto32_base10(const char* nptr, const char* limit,
 | |
|                             const char **endptr) {
 | |
|   *endptr = nptr;
 | |
|   while (nptr < limit && *nptr == '0') {
 | |
|     ++nptr;
 | |
|   }
 | |
|   if (nptr == limit || !ascii_isdigit(*nptr))
 | |
|     return -1;
 | |
|   const char* end_digits_run = nptr;
 | |
|   while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
 | |
|     ++end_digits_run;
 | |
|   }
 | |
|   *endptr = end_digits_run;
 | |
|   const int num_digits = end_digits_run - nptr;
 | |
|   // kint32max == 2147483647.
 | |
|   if (num_digits < 9 ||
 | |
|       (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
 | |
|     int value = 0;
 | |
|     for (; nptr < end_digits_run; ++nptr) {
 | |
|       value *= 10;
 | |
|       value += *nptr - '0';
 | |
|     }
 | |
|     // Overflow past the last valid unicode codepoint
 | |
|     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
 | |
|     return FixUnicodeValue(value);
 | |
|   } else {
 | |
|     // Overflow: can't fit in an int32;
 | |
|     // returns the replacement character 0xFFFD.
 | |
|     return 0xFFFD;
 | |
|   }
 | |
| }
 | |
| 
 | |
| static int32 strto32_base16(const char* nptr, const char* limit,
 | |
|                             const char **endptr) {
 | |
|   *endptr = nptr;
 | |
|   while (nptr < limit && *nptr == '0') {
 | |
|     ++nptr;
 | |
|   }
 | |
|   if (nptr == limit || !ascii_isxdigit(*nptr)) {
 | |
|     return -1;
 | |
|   }
 | |
|   const char* end_xdigits_run = nptr;
 | |
|   while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {
 | |
|     ++end_xdigits_run;
 | |
|   }
 | |
|   *endptr = end_xdigits_run;
 | |
|   const int num_xdigits = end_xdigits_run - nptr;
 | |
|   // kint32max == 0x7FFFFFFF.
 | |
|   if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {
 | |
|     int value = 0;
 | |
|     for (; nptr < end_xdigits_run; ++nptr) {
 | |
|       value <<= 4;
 | |
|       value += hex_digit_to_int(*nptr);
 | |
|     }
 | |
|     // Overflow past the last valid unicode codepoint
 | |
|     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
 | |
|     return FixUnicodeValue(value);
 | |
|   } else {
 | |
|     // Overflow: can't fit in an int32;
 | |
|     // returns the replacement character 0xFFFD.
 | |
|     return 0xFFFD;
 | |
|   }
 | |
| }
 | |
| 
 | |
| // Unescape the current character pointed to by src.  SETS the number
 | |
| // of chars read for the conversion (in UTF8).  If src isn't a valid entity,
 | |
| // just consume the & and RETURN -1.  If src doesn't point to & -- which it
 | |
| // should -- set src_consumed to 0 and RETURN -1.
 | |
| int ReadEntity(const char* src, int srcn, int* src_consumed) {
 | |
|   const char* const srcend = src + srcn;
 | |
| 
 | |
|   if (srcn == 0 || *src != '&') {      // input should start with an ampersand
 | |
|     *src_consumed = 0;
 | |
|     return -1;
 | |
|   }
 | |
|   *src_consumed = 1;                   // we'll get the & at least
 | |
| 
 | |
|   // The standards are a bit unclear on when an entity ends.  Certainly a ";"
 | |
|   // ends one, but spaces probably do too.  We follow the lead of both IE and
 | |
|   // Netscape, which as far as we can tell end numeric entities (1st case below)
 | |
|   // at any non-digit, and end character entities (2nd case) at any non-alnum.
 | |
|   const char* entstart, *entend;  // where the entity starts and ends
 | |
|   entstart = src + 1;             // read past the &
 | |
|   int entval;                     // UCS2 value of the entity
 | |
|   if ( *entstart == '#' ) {       // -- 1st case: numeric entity
 | |
|     if ( entstart + 2 >= srcend ) {
 | |
|       return -1;                  // no way a legitimate number could fit
 | |
|     } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) {   // hex numeric
 | |
|       entval = strto32_base16(entstart + 2, srcend, &entend);
 | |
|     } else {                                  // decimal numeric entity
 | |
|       entval = strto32_base10(entstart+1, srcend, &entend);
 | |
|     }
 | |
|     if (entval == -1 || entend > srcend) {
 | |
|       return -1;                 // not entirely correct, but close enough
 | |
|     }
 | |
|   } else {                       // -- 2nd case: character entity
 | |
|     for (entend = entstart;
 | |
|          entend < srcend && ascii_isalnum(*entend);
 | |
|          ++entend ) {
 | |
|       // entity consists of alphanumeric chars
 | |
|     }
 | |
|     entval = LookupEntity(entstart, entend - entstart);
 | |
|     if (entval < 0) {
 | |
|       return -1;  // not a legal entity name
 | |
|     }
 | |
|     // Now we do a strange-seeming IE6-compatibility check: if entval is
 | |
|     // >= 256, it *must* be followed by a semicolon or it's not considered
 | |
|     // an entity.  The problem is lots of the newfangled entity names, like
 | |
|     // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
 | |
|     // When these links are written in HTML, it would be really bad if the
 | |
|     // "&lang" were treated as an entity, which is what the spec says
 | |
|     // *should* happen (even when the HTML is inside an "A HREF" tag!)
 | |
|     // IE ignores the spec for these new, high-value entities, so we do too.
 | |
|     if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
 | |
|       return -1;                 // make non-;-terminated entity illegal
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // Finally, figure out how much src was consumed
 | |
|   if ( entend < srcend && *entend == ';' ) {
 | |
|     entend++;                    // standard says ; terminator is special
 | |
|   }
 | |
|   *src_consumed = entend - src;
 | |
|   return entval;
 | |
| }
 | |
| 
 | |
| 
 | |
| // Src points to '&'
 | |
| // Writes entity value to dst. Returns take(src), put(dst) byte counts
 | |
| void EntityToBuffer(const char* src, int len, char* dst,
 | |
|                     int* tlen, int* plen) {
 | |
|   char32 entval = ReadEntity(src, len, tlen);
 | |
| 
 | |
|   // ReadEntity does this already: entval = FixUnicodeValue(entval);
 | |
| 
 | |
|   // Convert UTF-32 to UTF-8
 | |
|   if (entval > 0) {
 | |
|     *plen = runetochar(dst, &entval);
 | |
|   } else {
 | |
|     // Illegal entity; ignore the '&'
 | |
|     *tlen = 1;
 | |
|     *plen = 0;
 | |
|   }
 | |
| }
 | |
| 
 | |
| // Returns true if character is < > or &, none of which are letters
 | |
| bool inline IsSpecial(char c) {
 | |
|   if ((c & 0xe0) == 0x20) {
 | |
|     return kSpecialSymbol[static_cast<uint8>(c)];
 | |
|   }
 | |
|   return false;
 | |
| }
 | |
| 
 | |
| // Quick Skip to next letter or < > & or to end of string (eos)
 | |
| // Always return is_letter for eos
 | |
| int ScanToLetterOrSpecial(const char* src, int len) {
 | |
|   int bytes_consumed;
 | |
|   StringPiece str(src, len);
 | |
|   UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
 | |
|   return bytes_consumed;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| // src points to non-letter, such as tag-opening '<'
 | |
| // Return length from here to next possible letter
 | |
| // On another < before >, return 1
 | |
| // advances <tag>
 | |
| //          |    |
 | |
| // advances <tag> ... </tag>  for <script> <style>
 | |
| //          |               |
 | |
| // advances <!-- ... <tag> ... -->
 | |
| //          |                     |
 | |
| // advances <tag
 | |
| //          |    | end of string
 | |
| // advances <tag <tag2>
 | |
| //          ||
 | |
| int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
 | |
|   const uint8* src = reinterpret_cast<const uint8*>(isrc);
 | |
|   const uint8* srclimit = src + len;
 | |
|   const uint8* tagParseTbl = kTagParseTbl_0;
 | |
|   int e = 0;
 | |
|   while (src < srclimit) {
 | |
|     e = tagParseTbl[kCharToSub[*src++]];
 | |
|     if (e <= max_exit_state) {
 | |
|       // We overshot by one byte
 | |
|       --src;
 | |
|       break;
 | |
|     }
 | |
|     tagParseTbl = &kTagParseTbl_0[e * 20];
 | |
|   }
 | |
| 
 | |
|   if (src >= srclimit) {
 | |
|     // We fell off the end of the text.
 | |
|     // It looks like the most common case for this is a truncated file, not
 | |
|     // mismatched angle brackets. So we pretend that the last char was '>'
 | |
|     return len;
 | |
|   }
 | |
| 
 | |
|   // OK to be in state 0 or state 2 at exit
 | |
|   if ((e != 0) && (e != 2)) {
 | |
|     // Error, '<' followed by '<'
 | |
|     // We want to back up to first <, then advance by one byte past it
 | |
|     int offset = src - reinterpret_cast<const uint8*>(isrc);
 | |
| 
 | |
|     // Backscan to first '<' and return enough length to just get past it
 | |
|     --offset;   // back up over the second '<', which caused us to stop
 | |
|     while ((0 < offset) && (isrc[offset] != '<')) {
 | |
|       // Find the first '<', which is unmatched
 | |
|       --offset;
 | |
|     }
 | |
|     // skip to just beyond first '<'
 | |
|     return offset + 1;
 | |
|   }
 | |
| 
 | |
|   return src - reinterpret_cast<const uint8*>(isrc);
 | |
| }
 | |
| 
 | |
| 
 | |
| ScriptScanner::ScriptScanner(const char* buffer,
 | |
|                              int buffer_length,
 | |
|                              bool is_plain_text)
 | |
|   : start_byte_(buffer),
 | |
|   next_byte_(buffer),
 | |
|   next_byte_limit_(buffer + buffer_length),
 | |
|   byte_length_(buffer_length),
 | |
|   is_plain_text_(is_plain_text),
 | |
|   letters_marks_only_(true),
 | |
|   one_script_only_(true),
 | |
|   exit_state_(kMaxExitStateLettersMarksOnly) {
 | |
|     script_buffer_ = new char[kMaxScriptBuffer];
 | |
|     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
 | |
|     map2original_.Clear();    // map from script_buffer_ to buffer
 | |
|     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
 | |
| }
 | |
| 
 | |
| // Extended version to allow spans of any non-tag text and spans of mixed script
 | |
| ScriptScanner::ScriptScanner(const char* buffer,
 | |
|                              int buffer_length,
 | |
|                              bool is_plain_text,
 | |
|                              bool any_text,
 | |
|                              bool any_script)
 | |
|   : start_byte_(buffer),
 | |
|   next_byte_(buffer),
 | |
|   next_byte_limit_(buffer + buffer_length),
 | |
|   byte_length_(buffer_length),
 | |
|   is_plain_text_(is_plain_text),
 | |
|   letters_marks_only_(!any_text),
 | |
|   one_script_only_(!any_script),
 | |
|   exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
 | |
|     script_buffer_ = new char[kMaxScriptBuffer];
 | |
|     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
 | |
|     map2original_.Clear();    // map from script_buffer_ to buffer
 | |
|     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
 | |
| }
 | |
| 
 | |
| 
 | |
| ScriptScanner::~ScriptScanner() {
 | |
|   delete[] script_buffer_;
 | |
|   delete[] script_buffer_lower_;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| // Get to the first real non-tag letter or entity that is a letter
 | |
| // Sets script of that letter
 | |
| // Return len if no more letters
 | |
| int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
 | |
|   int sc = UNKNOWN_ULSCRIPT;
 | |
|   int skip = 0;
 | |
|   int tlen, plen;
 | |
| 
 | |
|   // Do run of non-letters (tag | &NL | NL)*
 | |
|   tlen = 0;
 | |
|   while (skip < len) {
 | |
|     // Do fast scan to next interesting byte
 | |
|     // int oldskip = skip;
 | |
|     skip += ScanToLetterOrSpecial(src + skip, len - skip);
 | |
| 
 | |
|     // Check for no more letters/specials
 | |
|     if (skip >= len) {
 | |
|       // All done
 | |
|       *script = sc;
 | |
|       return len;
 | |
|     }
 | |
| 
 | |
|     // We are at a letter, nonletter, tag, or entity
 | |
|     if (IsSpecial(src[skip]) && !is_plain_text_) {
 | |
|       if (src[skip] == '<') {
 | |
|         // Begining of tag; skip to end and go around again
 | |
|         tlen = ScanToPossibleLetter(src + skip, len - skip,
 | |
|                                     exit_state_);
 | |
|         sc = 0;
 | |
|       } else if (src[skip] == '>') {
 | |
|         // Unexpected end of tag; skip it and go around again
 | |
|         tlen = 1;         // Over the >
 | |
|         sc = 0;
 | |
|       } else if (src[skip] == '&') {
 | |
|         // Expand entity, no advance
 | |
|         char temp[4];
 | |
|         EntityToBuffer(src + skip, len - skip,
 | |
|                        temp, &tlen, &plen);
 | |
|         sc = GetUTF8LetterScriptNum(temp);
 | |
|       }
 | |
|     } else {
 | |
|       // Update 1..4 bytes
 | |
|       tlen = UTF8OneCharLen(src + skip);
 | |
|       sc = GetUTF8LetterScriptNum(src + skip);
 | |
|     }
 | |
|     if (sc != 0) {break;}           // Letter found
 | |
|     skip += tlen;                   // Else advance
 | |
|   }
 | |
| 
 | |
|   *script = sc;
 | |
|   return skip;
 | |
| }
 | |
| 
 | |
| 
 | |
// These are for ASCII-only tag names
// Compare one letter uplow to lowercase c, ignoring the case of uplow
inline bool EqCase(char uplow, char c) {
  // Setting bit 0x20 maps an ASCII uppercase letter to its lowercase form
  return c == (uplow | 0x20);
}
 | |
| 
 | |
// These are for ASCII-only tag names
// Return true for space / < > etc. all less than 0x40
inline bool NeqLetter(char c) {
  return !(c >= 0x40);
}
 | |
| 
 | |
// These are for ASCII-only tag names
// Return true for space \n false for \r
inline bool WS(char c) {
  switch (c) {
    case ' ':
    case '\n':
      return true;
    default:
      return false;
  }
}
 | |
| 
 | |
| // Canonical line break: GetOneTextSpan maps both '\r' and '\n' to this
 | |
| static const char LF = '\n';
 | |
| 
 | |
| 
 | |
| // The naive loop scans from next_byte_ to script_buffer_ until full.
 | |
| // But this can leave an awkward hard-to-identify short fragment at the
 | |
| // end of the input. We would prefer to make the next-to-last fragment
 | |
| // shorter and the last fragment longer.
 | |
| 
 | |
| // Copy next run of non-tag characters to buffer [NUL terminated]
 | |
| // This just replaces tags with space or \n and removes entities.
 | |
| // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
 | |
| // including \r or \n are replaced by \n. All other tags and skipped text
 | |
| // are replaced with ASCII space.
 | |
| //
 | |
| // Buffer ALWAYS has leading space and trailing space space space NUL
 | |
| bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
 | |
|   span->text = script_buffer_;
 | |
|   span->text_bytes = 0;
 | |
|   span->offset = next_byte_ - start_byte_;
 | |
|   span->ulscript = UNKNOWN_ULSCRIPT;
 | |
|   span->lang = UNKNOWN_LANGUAGE;
 | |
|   span->truncated = false;
 | |
| 
 | |
|   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
 | |
|   if ((kMaxScriptBytes <= byte_length_) &&
 | |
|       (byte_length_ < (2 * kMaxScriptBytes))) {
 | |
|     // Try to split the last two fragments in half
 | |
|     put_soft_limit = byte_length_ / 2;
 | |
|   }
 | |
| 
 | |
|   script_buffer_[0] = ' ';  // Always a space at front of output
 | |
|   script_buffer_[1] = '\0';
 | |
|   int take = 0;
 | |
|   int put = 1;              // Start after the initial space
 | |
|   int tlen, plen;
 | |
| 
 | |
|   if (byte_length_ <= 0) {
 | |
|     return false;          // No more text to be found
 | |
|   }
 | |
| 
 | |
|   // Go over alternating spans of text and tags,
 | |
|   // copying letters to buffer with single spaces for each run of non-letters
 | |
|   bool last_byte_was_space = false;
 | |
|   while (take < byte_length_) {
 | |
|     char c = next_byte_[take];
 | |
|     if (c == '\r') {c = LF;}      // Canonical CR or LF
 | |
|     if (c == '\n') {c = LF;}      // Canonical CR or LF
 | |
| 
 | |
|     if (IsSpecial(c) && !is_plain_text_) {
 | |
|       if (c == '<') {
 | |
|         // Replace tag with space
 | |
|         c = ' ';                      // for almost-full test below
 | |
|         // or if <p> <br> <tr>, replace with \n
 | |
|         if (take < (byte_length_ - 3)) {
 | |
|           if (EqCase(next_byte_[take + 1], 'p') &&
 | |
|               NeqLetter(next_byte_[take + 2])) {
 | |
|             c = LF;
 | |
|           }
 | |
|           if (EqCase(next_byte_[take + 1], 'b') &&
 | |
|               EqCase(next_byte_[take + 2], 'r') &&
 | |
|               NeqLetter(next_byte_[take + 3])) {
 | |
|             c = LF;
 | |
|           }
 | |
|           if (EqCase(next_byte_[take + 1], 't') &&
 | |
|               EqCase(next_byte_[take + 2], 'r') &&
 | |
|               NeqLetter(next_byte_[take + 3])) {
 | |
|             c = LF;
 | |
|           }
 | |
|         }
 | |
|         // Begining of tag; skip to end and go around again
 | |
|         tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
 | |
|                                     exit_state_);
 | |
|         // Copy one byte, compressing spaces
 | |
|         if (!last_byte_was_space || !WS(c)) {
 | |
|           script_buffer_[put++] = c;      // Advance dest
 | |
|           last_byte_was_space = WS(c);
 | |
|         }
 | |
|       } else if (c == '>') {
 | |
|         // Unexpected end of tag; copy it and go around again
 | |
|         tlen = 1;         // Over the >
 | |
|         script_buffer_[put++] = c;    // Advance dest
 | |
|       } else if (c == '&') {
 | |
|         // Expand entity, no advance
 | |
|         EntityToBuffer(next_byte_ + take, byte_length_ - take,
 | |
|                        script_buffer_ + put, &tlen, &plen);
 | |
|         put += plen;                  // Advance dest
 | |
|       }
 | |
|       take += tlen;                   // Advance source
 | |
|     } else {
 | |
|       // Copy one byte, compressing spaces
 | |
|       if (!last_byte_was_space || !WS(c)) {
 | |
|         script_buffer_[put++] = c;      // Advance dest
 | |
|         last_byte_was_space = WS(c);
 | |
|       }
 | |
|       ++take;                         // Advance source
 | |
|     }
 | |
| 
 | |
|     if (WS(c) &&
 | |
|         (put >= put_soft_limit)) {
 | |
|       // Buffer is almost full
 | |
|       span->truncated = true;
 | |
|       break;
 | |
|     }
 | |
|     if (put >= kMaxScriptBytes) {
 | |
|       // Buffer is completely full
 | |
|       span->truncated = true;
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // Almost done. Back up to a character boundary if needed
 | |
|   while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
 | |
|     // Back up over continuation byte
 | |
|     --take;
 | |
|     --put;
 | |
|   }
 | |
| 
 | |
|   // Update input position
 | |
|   next_byte_ += take;
 | |
|   byte_length_ -= take;
 | |
| 
 | |
|   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
 | |
|   //                          kMaxScriptBytes |   | put
 | |
|   script_buffer_[put + 0] = ' ';
 | |
|   script_buffer_[put + 1] = ' ';
 | |
|   script_buffer_[put + 2] = ' ';
 | |
|   script_buffer_[put + 3] = '\0';
 | |
| 
 | |
|   span->text_bytes = put;       // Does not include the last four chars above
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| 
 | |
| // Copy next run of same-script non-tag letters to script_buffer_ [NUL terminated] and fill in *span to describe it
 | |
| // Buffer ALWAYS has leading space and trailing space space space NUL. Returns false once no more letters are found
 | |
| bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
 | |
|   if (!letters_marks_only_) {
 | |
|     // Return non-tag text, including punctuation and digits
 | |
|     return GetOneTextSpan(span);
 | |
|   }
 | |
|   // Start with an empty span of unknown script/language at the current input position
 | |
|   span->text = script_buffer_;
 | |
|   span->text_bytes = 0;
 | |
|   span->offset = next_byte_ - start_byte_;
 | |
|   span->ulscript = UNKNOWN_ULSCRIPT;
 | |
|   span->lang = UNKNOWN_LANGUAGE;
 | |
|   span->truncated = false;
 | |
| 
 | |
|   // struct timeval script_start, script_mid, script_end;
 | |
|   // Soft limit: once put reaches it, the span is ended after the next run of non-letters
 | |
|   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
 | |
|   if ((kMaxScriptBytes <= byte_length_) &&
 | |
|       (byte_length_ < (2 * kMaxScriptBytes))) {
 | |
|     // Try to split the last two fragments in half
 | |
|     put_soft_limit = byte_length_ / 2;
 | |
|   }
 | |
| 
 | |
| 
 | |
|   int spanscript;           // The script of this span
 | |
|   int sc = UNKNOWN_ULSCRIPT;  // The script of next character
 | |
|   int tlen = 0;             // Source bytes covered by the current item (added to take)
 | |
|   int plen = 0;             // Output bytes produced by the current item (added to put)
 | |
| 
 | |
|   script_buffer_[0] = ' ';  // Always a space at front of output
 | |
|   script_buffer_[1] = '\0';
 | |
|   int take = 0;             // Read offset from next_byte_
 | |
|   int put = 1;              // Start after the initial space
 | |
| 
 | |
|   // Build offsets from span->text back to start_byte_ + span->offset
 | |
|   // This mapping reflects deletion of non-letters, expansion of
 | |
|   // entities, etc.
 | |
|   map2original_.Clear();
 | |
|   map2original_.Delete(span->offset);   // So that MapBack(0) gives offset
 | |
| 
 | |
|   // Get to the first real non-tag letter or entity that is a letter
 | |
|   int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
 | |
|   next_byte_ += skip;
 | |
|   byte_length_ -= skip;
 | |
|   // Map the leading output space: a single skipped byte is copied 1:1, otherwise skipped bytes are deleted and one byte inserted
 | |
|   if (skip != 1) {
 | |
|     map2original_.Delete(skip);
 | |
|     map2original_.Insert(1);
 | |
|   } else {
 | |
|     map2original_.Copy(1);
 | |
|   }
 | |
|   if (byte_length_ <= 0) {
 | |
|     map2original_.Reset();
 | |
|     return false;               // No more letters to be found
 | |
|   }
 | |
| 
 | |
|   // There is at least one letter, so we know the script for this span
 | |
|   span->ulscript = (ULScript)spanscript;
 | |
| 
 | |
| 
 | |
|   // Go over alternating spans of same-script letters and non-letters,
 | |
|   // copying letters to buffer with single spaces for each run of non-letters
 | |
|   while (take < byte_length_) {
 | |
|     // Copy run of letters in same script (&LS | LS)*
 | |
|     int letter_count = 0;              // Keep track of word length
 | |
|     bool need_break = false;
 | |
| 
 | |
|     while (take < byte_length_) {
 | |
|       // We are at a letter, nonletter, tag, or entity
 | |
|       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
 | |
|         if (next_byte_[take] == '<') {
 | |
|           // Beginning of tag; ends the letter run (handled by the non-letter loop below)
 | |
|           sc = 0;
 | |
|           break;
 | |
|         } else if (next_byte_[take] == '>') {
 | |
|           // Unexpected end of tag
 | |
|           sc = 0;
 | |
|           break;
 | |
|         } else if (next_byte_[take] == '&') {
 | |
|           // Copy entity, no advance; script is taken from the bytes written at script_buffer_ + put
 | |
|           EntityToBuffer(next_byte_ + take, byte_length_ - take,
 | |
|                          script_buffer_ + put, &tlen, &plen);
 | |
|           sc = GetUTF8LetterScriptNum(script_buffer_ + put);
 | |
|         }
 | |
|       } else {
 | |
|         // Real letter, safely copy up to 4 bytes, increment by 1..4
 | |
|         // Will update by 1..4 bytes at Advance, below
 | |
|         tlen = plen = UTF8OneCharLen(next_byte_ + take);
 | |
|         if (take < (byte_length_ - 3)) {
 | |
|           // X86 fast case, does unaligned load/store
 | |
|           UNALIGNED_STORE32(script_buffer_ + put,
 | |
|                             UNALIGNED_LOAD32(next_byte_ + take));
 | |
|           // (always moves 4 bytes; put later advances only by plen)
 | |
|         } else {
 | |
|           // Slow case, happens 1-3 times per input document
 | |
|           memcpy(script_buffer_ + put, next_byte_ + take, plen);
 | |
|         }
 | |
|         sc = GetUTF8LetterScriptNum(next_byte_ + take);
 | |
|       }
 | |
| 
 | |
|       // Allow continue across a single letter in a different script:
 | |
|       // A B D = three scripts, c = common script, i = inherited script,
 | |
|       // - = don't care, ( = take position before the += below
 | |
|       //  AAA(A-    continue
 | |
|       //
 | |
|       //  AAA(BA    continue
 | |
|       //  AAA(BB    break
 | |
|       //  AAA(Bc    continue (breaks after B)
 | |
|       //  AAA(BD    break
 | |
|       //  AAA(Bi    break
 | |
|       //
 | |
|       //  AAA(c-    break
 | |
|       //
 | |
|       //  AAA(i-    continue
 | |
|       //
 | |
| 
 | |
|       if ((sc != spanscript) && (sc != ULScript_Inherited)) {
 | |
|         // Might need to break this script span
 | |
|         if (sc == ULScript_Common) {
 | |
|           need_break = true;
 | |
|         } else {
 | |
|           // Look at next following character, ignoring entity as Common
 | |
|           int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);  // NOTE(review): not bounds-checked against byte_length_ -- confirm input is readable past the end
 | |
|           if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
 | |
|             // We found a non-trivial change of script
 | |
|             if (one_script_only_) {
 | |
|               need_break = true;
 | |
|             }
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|       if (need_break) {break;}  // Non-letter or letter in wrong script
 | |
| 
 | |
|       take += tlen;                   // Advance
 | |
|       put += plen;                    // Advance
 | |
| 
 | |
|       // Update the offset map to reflect take/put lengths
 | |
|       if (tlen == plen) {
 | |
|         map2original_.Copy(tlen);
 | |
|       } else if (tlen < plen) {
 | |
|         map2original_.Copy(tlen);
 | |
|         map2original_.Insert(plen - tlen);
 | |
|       } else {    // plen < tlen
 | |
|         map2original_.Copy(plen);
 | |
|         map2original_.Delete(tlen - plen);
 | |
|       }
 | |
| 
 | |
|       ++letter_count;
 | |
|       if (put >= kMaxScriptBytes) {
 | |
|         // Buffer is full
 | |
|         span->truncated = true;
 | |
|         break;
 | |
|       }
 | |
|     }     // End while letters
 | |
| 
 | |
|     // Do run of non-letters (tag | &NL | NL)*
 | |
|     while (take < byte_length_) {
 | |
|       // Do fast scan to next interesting byte
 | |
|       tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
 | |
|       take += tlen;
 | |
|       map2original_.Delete(tlen);
 | |
|       if (take >= byte_length_) {break;}    // Might have scanned to end
 | |
| 
 | |
|       // We are at a letter, nonletter, tag, or entity
 | |
|       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
 | |
|         if (next_byte_[take] == '<') {
 | |
|           // Beginning of tag; skip to end and go around again
 | |
|           tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
 | |
|                                       exit_state_);
 | |
|           sc = 0;
 | |
|         } else if (next_byte_[take] == '>') {
 | |
|           // Unexpected end of tag; skip it and go around again
 | |
|           tlen = 1;         // Over the >
 | |
|           sc = 0;
 | |
|         } else if (next_byte_[take] == '&') {
 | |
|           // Expand entity, no advance
 | |
|           EntityToBuffer(next_byte_ + take, byte_length_ - take,
 | |
|                          script_buffer_ + put, &tlen, &plen);
 | |
|           sc = GetUTF8LetterScriptNum(script_buffer_ + put);
 | |
|         }
 | |
|       } else {
 | |
|         // Update 1..4
 | |
|         tlen = UTF8OneCharLen(next_byte_ + take);
 | |
|         sc = GetUTF8LetterScriptNum(next_byte_ + take);
 | |
|       }
 | |
|       if (sc != 0) {break;}           // Letter found
 | |
|       take += tlen;                   // Else advance
 | |
|       map2original_.Delete(tlen);
 | |
|     }     // End while not-letters
 | |
|     // One output space stands for the whole run of non-letters just skipped
 | |
|     script_buffer_[put++] = ' ';
 | |
|     map2original_.Insert(1);
 | |
| 
 | |
|     // Letter in wrong script ?
 | |
|     if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
 | |
|     if (put >= put_soft_limit) {
 | |
|       // Buffer is almost full
 | |
|       span->truncated = true;
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // Almost done. Back up to a character boundary if needed
 | |
|   while ((0 < take) && (take < byte_length_) &&
 | |
|          ((next_byte_[take] & 0xc0) == 0x80)) {
 | |
|     // Back up over continuation byte (input and output positions move together)
 | |
|     --take;
 | |
|     --put;
 | |
|   }
 | |
| 
 | |
|   // Update input position
 | |
|   next_byte_ += take;
 | |
|   byte_length_ -= take;
 | |
| 
 | |
|   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
 | |
|   //                          kMaxScriptBytes |   | put
 | |
|   script_buffer_[put + 0] = ' ';
 | |
|   script_buffer_[put + 1] = ' ';
 | |
|   script_buffer_[put + 2] = ' ';
 | |
|   script_buffer_[put + 3] = '\0';
 | |
|   map2original_.Insert(4);      // Map entries for the four pad bytes just written
 | |
|   map2original_.Reset();
 | |
| 
 | |
|   span->text_bytes = put;       // Does not include the last four chars above
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| // Lowercase the text of *span into script_buffer_lower_ and repoint the span
 | |
| // at that lowered copy.
 | |
| // The list of cased scripts changes with each version of Unicode, so the
 | |
| // text is always lowercased regardless of script.
 | |
| // Unicode 6.2.0:
 | |
| //   ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
 | |
| void ScriptScanner::LowerScriptSpan(LangSpan* span) {
 | |
|   // Lowercasing happens here, after the span is built; doing it sooner might
 | |
|   // miss an upper-case letter that was written as an entity.
 | |
| 
 | |
|   // The span carries four pad bytes after its text, but the last one is
 | |
|   // 0x00, which is not interchange-valid. So only three pad bytes go through
 | |
|   // the lowercasing pass and the 0x00 is stored explicitly afterwards.
 | |
|   const int kLoweredPadBytes = 3;
 | |
| 
 | |
|   // Rebuild the offset map from script_buffer_lower_ back to script_buffer_
 | |
|   map2uplow_.Clear();
 | |
| 
 | |
|   StringPiece input(span->text, span->text_bytes + kLoweredPadBytes);
 | |
|   StringPiece output(script_buffer_lower_, kMaxScriptLowerBuffer);
 | |
|   int bytes_consumed;
 | |
|   int bytes_filled;
 | |
|   int bytes_changed;
 | |
| 
 | |
|   // Full Unicode lowercase of text plus pad bytes, recording offsets as we go
 | |
|   UTF8GenericReplace(&utf8repl_lettermarklower_obj,
 | |
|                      input, output, is_plain_text_,
 | |
|                      &bytes_consumed, &bytes_filled, &bytes_changed,
 | |
|                      &map2uplow_);
 | |
| 
 | |
|   // Terminate the lowered copy, then hand it to the caller via the span
 | |
|   script_buffer_lower_[bytes_filled] = '\0';
 | |
|   span->text_bytes = bytes_filled - kLoweredPadBytes;
 | |
|   span->text = script_buffer_lower_;
 | |
|   map2uplow_.Reset();
 | |
| }
 | |
| 
 | |
| // Copy next run of same-script non-tag letters to buffer [NUL terminated]
 | |
| // Force Latin, Cyrillic, Greek scripts to be lowercase
 | |
| // Buffer ALWAYS has leading space and trailing space space space NUL
 | |
| // Returns the result of GetOneScriptSpan. The span is lowercased only when
 | |
| // that call succeeds; on failure *span is left as GetOneScriptSpan set it.
 | |
| bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
 | |
|   bool ok = GetOneScriptSpan(span);
 | |
|   if (ok) {
 | |
|     // Only a successful span has valid text plus pad bytes to lowercase.
 | |
|     // A failed span has text_bytes == 0 and no pad bytes written, so
 | |
|     // lowercasing text_bytes + 3 bytes would read stale buffer contents and
 | |
|     // could leave text_bytes negative.
 | |
|     LowerScriptSpan(span);
 | |
|   }
 | |
|   return ok;
 | |
| }
 | |
| 
 | |
| 
 | |
| // Translates a byte offset within the most recent GetOneScriptSpan/Lower
 | |
| // span->text [0..text_bytes] into the additional byte offset from
 | |
| // span->offset that locates the corresponding text in the original
 | |
| // input buffer.
 | |
| // text_offset must point at the first byte of a UTF-8 character, or just
 | |
| // beyond the last character. The usual pattern is two calls: one with the
 | |
| // first byte of an interesting range and one with the first byte of the
 | |
| // range that follows it.
 | |
| int ScriptScanner::MapBack(int text_offset) {
 | |
|   // Undo the lowercasing map first, then the map back to the original input
 | |
|   int unlowered_offset = map2uplow_.MapBack(text_offset);
 | |
|   return map2original_.MapBack(unlowered_offset);
 | |
| }
 | |
| 
 | |
| 
 | |
| // Looks up the lscript number of the UTF-8 character starting at src.
 | |
| // Letters yield their script number; non-letters always yield
 | |
| //   0 (common script)
 | |
| int GetUTF8LetterScriptNum(const char* src) {
 | |
|   // The property lookup takes its position and length by address
 | |
|   const uint8* cursor = reinterpret_cast<const uint8*>(src);
 | |
|   int char_len = UTF8OneCharLen(src);
 | |
|   return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
 | |
|                                     &cursor, &char_len);
 | |
| }
 | |
| 
 | |
| }  // namespace CLD2
 | |
| 
 | |
| 
 | 
