Skip to content
Snippets Groups Projects
Commit 030bdf76 authored by sidchat@google.com's avatar sidchat@google.com
Browse files

Add Compact Language Detection (CLD) library to Chrome. This works in Windows only currently.

BUG=none
TEST=none
Review URL: http://codereview.chromium.org/122007

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@18445 0039d316-1c4b-4281-b951-d872f2087c98
parent 30b0252c
No related merge requests found
Showing
with 50195 additions and 0 deletions
......@@ -63,6 +63,7 @@
'../sandbox/sandbox.gyp:*',
'../third_party/bsdiff/bsdiff.gyp:*',
'../third_party/bspatch/bspatch.gyp:*',
'../third_party/cld/cld.gyp:*',
'../third_party/tcmalloc/tcmalloc.gyp:*',
'../tools/memory_watcher/memory_watcher.gyp:*',
'../webkit/activex_shim/activex_shim.gyp:*',
......
......@@ -1756,6 +1756,7 @@
'../google_update/google_update.gyp:google_update',
'installer/installer.gyp:installer_util',
'../printing/printing.gyp:printing',
'../third_party/cld/cld.gyp:cld',
'../views/views.gyp:views',
'../gears/gears.gyp:gears',
],
......@@ -3959,6 +3960,7 @@
'../third_party/bsdiff/bsdiff.gyp:*',
'../third_party/bspatch/bspatch.gyp:*',
'../third_party/bzip2/bzip2.gyp:*',
'../third_party/cld/cld.gyp:cld',
'../third_party/codesighs/codesighs.gyp:*',
'../third_party/ffmpeg/ffmpeg.gyp:*',
'../third_party/icu38/icu38.gyp:*',
......@@ -4008,6 +4010,7 @@
#'theme_dll',
'worker',
'../net/net.gyp:net_resources',
'../third_party/cld/cld.gyp:cld',
'../third_party/tcmalloc/tcmalloc.gyp:tcmalloc',
'../views/views.gyp:views',
'../webkit/webkit.gyp:webkit_resources',
......
CLD - list of changes (sidchat, May 2009)
- cld_interface.cc - comment out
//#include "cld/bar/common/component.h"
//#include "cld/bar/common/execute/execute_utils.h"
and comment out requirements in function ::GetVerifiedDllFileName
-------------
Commented out the following in commandlineflags.h
class FlagSaver {
public:
FlagSaver();
~FlagSaver();
private:
class FlagSaverImpl* impl_; // we use pimpl here to keep API steady
FlagSaver(const FlagSaver&); // no copying!
void operator=(const FlagSaver&);
}
#ifndef SWIG // swig seems to have trouble with this for some reason
ATTRIBUTE_UNUSED
#endif
;
----------------
\ No newline at end of file
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BAR_COMMON_SCOPEDLIBRARY_H_
#define BAR_COMMON_SCOPEDLIBRARY_H_
// Owner of a DLL that is loaded by name in the constructor and freed in the
// destructor. Gives access to the loaded library's handle (HMODULE type).
//
// Example:
//   ScopedLibrary library(LIBRARY_NAME);
//   ... = ::GetProcAddress(library.handle(), FUNCTION_NAME);
class ScopedLibrary {
 public:
  // Attempts to load the library right away; use IsValid() to check the
  // outcome.
  // [in] file_name - library's file name.
  explicit ScopedLibrary(const TCHAR *file_name)
      : library_(::LoadLibrary(file_name)) {}

  // Frees the owned library, but only when one was actually loaded.
  ~ScopedLibrary() {
    if (IsValid())
      ::FreeLibrary(library_);
  }

  // Returns true if library was loaded successfully.
  bool IsValid() const { return library_ != NULL; }

  // Raw handle of the owned library (NULL when IsValid() is false).
  HMODULE handle() const { return library_; }

 private:
  // Handle to loaded library; fixed for the lifetime of this object.
  const HMODULE library_;

  DISALLOW_COPY_AND_ASSIGN(ScopedLibrary);
};
// Holder for a pointer to a function looked up in a DLL.
// FunctionPrototype is a regular C-style pointer-to-function type
// definition. For example, type of WinAPI IsValidSid function:
//   BOOL (WINAPI*)(PSID)
//
// Example:
//   FunctionFromDll<BOOL (WINAPI*)(PSID)> is_valid_sid;
//   ... = is_valid_sid.function()(...);
template<typename FunctionPrototype>
class FunctionFromDll {
 public:
  // Starts out unbound: IsValid() is false until Bind() succeeds.
  FunctionFromDll() : function_(NULL) {}

  // Looks up |name| in |library| and stores the result, replacing any
  // previously bound pointer.
  // [in] library - handle to a library containing a function.
  //                Must not be NULL.
  // [in] name - name of the function.
  void Bind(HMODULE library, const char *name) {
    function_ = reinterpret_cast<FunctionPrototype>(
        ::GetProcAddress(library, name));
  }

  // The bound function pointer, or NULL if nothing is bound.
  FunctionPrototype function() const { return function_; }

  // Returns true if function was bound successfully.
  bool IsValid() const { return function() != NULL; }

 private:
  // Pointer to the function.
  FunctionPrototype function_;

  DISALLOW_COPY_AND_ASSIGN(FunctionFromDll);
};
#endif // BAR_COMMON_SCOPEDLIBRARY_H_
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BAR_COMMON_SCOPEDPTR_H_
#define BAR_COMMON_SCOPEDPTR_H_
// Wraps a value of a simple type so that cleanup code runs when the wrapper
// goes out of scope. The destructor is only declared here; an implementation
// must be provided for each type T that is used.
template < class T >
class ScopedObject {
 public:
  // Stores a copy of |v|.
  explicit ScopedObject(const T& v) : v_(v) { }

  // Declared only -- defined separately per type.
  ~ScopedObject();

  // Returns a copy of the wrapped value.
  T get() const { return v_; }

  // Implicit conversion to the wrapped type; same result as get().
  operator T() const { return get(); }

 private:
  T v_;

  DISALLOW_COPY_AND_ASSIGN(ScopedObject);
};
// Scoped owner for the various HANDLE- and LPVOID-based types.
// The wrapped value is passed to destroy() when this object is destructed or
// when reset() replaces it. destroy() is only declared here; an
// implementation must be provided for each <T, DIFFERENTIATOR> pair.
// DIFFERENTIATOR makes wrappers distinct types even when they share the same
// underlying T.
// Added by Breen Hagan of Google.
template < class T, int DIFFERENTIATOR >
class ScopedHandle {
 public:
  // Takes over the value |v|.
  explicit ScopedHandle(const T& v) : v_(v) {}

  ~ScopedHandle() { destroy(); }

  // Returns the wrapped value; this object keeps responsibility for it.
  T get() const { return v_; }

  // Implicit conversion to the wrapped type; same result as get().
  operator T() const { return get(); }

  // Replaces the wrapped value with |v|, destroying the old one first.
  // Does nothing when |v| is already the wrapped value.
  void reset(const T& v) {
    if (v_ != v) {
      destroy();
      v_ = v;
    }
  }

  // Swap two scoped handlers.
  void swap(ScopedHandle& h2) {
    T mine = v_;
    v_ = h2.v_;
    h2.v_ = mine;
  }

  // Gives up the wrapped value without destroying it: the stored value is
  // zeroed and the previous value is returned to the caller.
  T release() {
    T held = v_;
    v_ = 0;
    return held;
  }

 private:
  // Type-specific cleanup of v_; specialized per <T, DIFFERENTIATOR>.
  void destroy();

  T v_;

  DISALLOW_COPY_AND_ASSIGN(ScopedHandle);
};
// Free functions.

// Non-member swap for ScopedHandle; exchanges the values wrapped by |h1| and
// |h2| through the member swap(), which is symmetric in its two operands.
template <class T, int DIFFERENTIATOR>
inline void swap(ScopedHandle<T, DIFFERENTIATOR>& h1,
                 ScopedHandle<T, DIFFERENTIATOR>& h2) {
  h2.swap(h1);
}
// Uses ScopedHandle to automatically call CloseHandle().
// Both "no handle" values are skipped: NULL and INVALID_HANDLE_VALUE, which
// is what CreateFile() and similar APIs return on failure. Passing the
// latter to CloseHandle() is an error and raises an exception when the
// process runs under a debugger.
typedef ScopedHandle< HANDLE, 1 > SAFE_HANDLE;
template <>
inline void ScopedHandle< HANDLE, 1 >::destroy() {
  if (v_ != NULL && v_ != INVALID_HANDLE_VALUE)
    ::CloseHandle(v_);
}
// ScopedHandle flavor that calls CryptReleaseContext() on the wrapped
// provider handle. A zero handle is treated as "nothing held".
typedef ScopedHandle< HCRYPTPROV, 2 > SAFE_HCRYPTPROV;
template <>
inline void ScopedHandle< HCRYPTPROV, 2 >::destroy() {
  if (!v_)
    return;
  ::CryptReleaseContext(v_, 0);
}
// ScopedHandle flavor that calls CryptDestroyKey() on the wrapped key
// handle. A zero handle is treated as "nothing held".
typedef ScopedHandle< HCRYPTKEY, 3 > SAFE_HCRYPTKEY;
template <>
inline void ScopedHandle< HCRYPTKEY, 3 >::destroy() {
  if (!v_)
    return;
  ::CryptDestroyKey(v_);
}
// ScopedHandle flavor that calls CryptDestroyHash() on the wrapped hash
// handle. A zero handle is treated as "nothing held".
typedef ScopedHandle< HCRYPTHASH, 4 > SAFE_HCRYPTHASH;
template <>
inline void ScopedHandle< HCRYPTHASH, 4 >::destroy() {
  if (!v_)
    return;
  ::CryptDestroyHash(v_);
}
// ScopedHandle flavor that calls UnmapViewOfFile() on the wrapped view
// address. A NULL address is treated as "nothing held".
typedef ScopedHandle< LPVOID, 5 > SAFE_MAPPEDVIEW;
template <>
inline void ScopedHandle< LPVOID, 5 >::destroy() {
  if (!v_)
    return;
  ::UnmapViewOfFile(v_);
}
// SAFE_HINTERNET
// ScopedHandle flavor that calls InternetCloseHandle() on the wrapped
// handle. A NULL handle is treated as "nothing held".
typedef ScopedHandle< HINTERNET, 6 > SAFE_HINTERNET;
template <>
inline void ScopedHandle< HINTERNET, 6 >::destroy() {
  if (!v_)
    return;
  ::InternetCloseHandle(v_);
}
// SAFE_HMODULE
// ScopedHandle flavor that calls ::FreeLibrary() on the wrapped module
// handle. A NULL handle is treated as "nothing held".
typedef ScopedHandle< HMODULE, 7 > SAFE_HMODULE;
template <>
inline void ScopedHandle< HMODULE, 7 >::destroy() {
  if (!v_)
    return;
  ::FreeLibrary(v_);
}
// SAFE_RESOURCE
// ScopedHandle flavor that calls ::FreeResource() on the wrapped resource
// handle. A NULL handle is treated as "nothing held".
// The type is HGLOBAL for backward compatibility, see MSDN, LoadResource()
// function for details.
typedef ScopedHandle< HGLOBAL, 8 > SAFE_RESOURCE;
template <>
inline void ScopedHandle< HGLOBAL, 8 >::destroy() {
  if (!v_)
    return;
  ::FreeResource(v_);
}
// ScopedIntCounter is a class that will increment the given integer on
// construction and decrement it again when the object is destructed.
// The integer is not owned; it is dereferenced in the destructor, so it has
// to stay alive for as long as this object does.
class ScopedIntCounter {
 public:
  // [in] counter - integer to track. Must not be NULL: it is dereferenced
  //                unconditionally here and in the destructor.
  // Explicit so that an int* is never silently converted into a counter.
  explicit ScopedIntCounter(int *counter)
      : counter_(counter) {
    ++(*counter_);
  }

  ~ScopedIntCounter() {
    --(*counter_);
  }

  // Returns the current value of the tracked integer.
  int count() const {
    return *counter_;
  }

 private:
  // Not copyable: a copy would decrement the integer a second time without a
  // matching increment. Declared but intentionally never defined.
  ScopedIntCounter(const ScopedIntCounter&);
  void operator=(const ScopedIntCounter&);

  int* counter_;
};
#endif // BAR_COMMON_SCOPEDPTR_H_
This diff is collapsed.
This diff is collapsed.
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_CLDUTIL_DBG_H_
#define I18N_ENCODINGS_COMPACT_LANG_DET_CLDUTIL_DBG_H_
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil.h"
#include <string>
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/tote.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h"
// Debug flags. The matching DEFINE_bool() lines in cldutil_dbg.cc describe
// them as: dbgscore - print picture of score calculation, dbglookup - print
// every quad/uni lookup in score calc, dbgreli - print reliability in score
// calc.
DECLARE_bool(dbgscore);
DECLARE_bool(dbglookup);
DECLARE_bool(dbgreli);
// Declarations of the scoring debug hooks.
// NOTE(review): `string` is used unqualified and FILE is used without a
// visible <stdio.h> include in this header; presumably both come in through
// the headers included above -- confirm.
namespace cld {
//------------------------------------------------------------------------------
// Debugging. Not thread safe
//------------------------------------------------------------------------------
// Called with the source text and its length.
// NOTE(review): presumably resets the debug state for a new scoring run --
// confirm against the non-empty implementation.
void DbgScoreInit(const char* src, int len);
// Return a 3-byte + NUL code for language
// temp is the caller-supplied output buffer (so presumably at least 4 bytes).
void DbgLangName3(Language lang, char* temp);
// Show all per-language totals
void DbgScoreState();
void DbgScoreTop(const char* src, int srclen, Tote* chunk_tote);
void DbgScoreFlush();
// Allow additional scoring debug output
void DbgScoreRecord(const char* src, uint32 probs, int len);
void DbgScoreRecordUni(const char* src, int propval, int len);
// Debug print language name(s)
// f is the stream the output is written to.
void PrintLang(FILE* f, const Tote* chunk_tote,
const Language cur_lang, const bool cur_unreliable,
Language prior_lang, bool prior_unreliable);
// Debug print language name(s)
void PrintLang2(FILE* f,
const Language lang1, const Language lang2, bool diff_prior);
// Debug print text span
void PrintText(FILE* f, Language cur_lang, const string& str);
// Debug print text span with speculative language
void PrintTextSpeculative(FILE* f, Language cur_lang, const string& str);
// Debug print ignored text span
void PrintSkippedText(FILE* f, const string& str);
// The *ToStderr routines below take no stream argument; as their names say,
// they are meant to write to stderr.
void DbgProbsToStderr(uint32 probs);
void DbgUniTermToStderr(int propval, const uint8* usrc, int len);
// No pre/post space
void DbgBiTermToStderr(uint32 bihash, uint32 probs,
const char* src, int len);
void DbgQuadTermToStderr(uint32 quadhash, uint32 probs,
const char* src, int len);
void DbgWordTermToStderr(uint64 wordhash, uint32 probs,
const char* src, int len);
} // End namespace cld
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_CLDUTIL_DBG_H_
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil_dbg.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h"
DEFINE_bool(dbgscore, false, "Print picture of score calculation");
DEFINE_bool(dbglookup, false, "Print every quad/uni lookup in score calc");
DEFINE_bool(dbgreli, false, "Print reliability in score calc");
namespace cld {
//------------------------------------------------------------------------------
// Debugging. Not thread safe
// This is the empty version -- routines return immediately
//------------------------------------------------------------------------------
void DbgScoreInit(const char* src, int len) {};
// Return a 3-byte + NUL code for language
void DbgLangName3(Language lang, char* temp) {};
// Show all per-language totals
void DbgScoreState() {};
void DbgScoreTop(const char* src, int srclen, Tote* chunk_tote) {};
void DbgScoreFlush() {};
// Allow additional scoring debug output
void DbgScoreRecord(const char* src, uint32 probs, int len) {};
void DbgScoreRecordUni(const char* src, int propval, int len) {};
// Debug print language name(s)
void PrintLang(FILE* f, const Tote* chunk_tote,
const Language cur_lang, const bool cur_unreliable,
Language prior_lang, bool prior_unreliable) {};
// Debug print language name(s)
void PrintLang2(FILE* f,
const Language lang1, const Language lang2, bool diff_prior) {};
// Debug print text span
void PrintText(FILE* f, Language cur_lang, const string& str) {};
// Debug print text span with speculative language
void PrintTextSpeculative(FILE* f, Language cur_lang, const string& str) {};
// Debug print ignored text span
void PrintSkippedText(FILE* f, const string& str) {};
void DbgProbsToStderr(uint32 probs) {};
void DbgUniTermToStderr(int propval, const uint8* usrc, int len) {};
// No pre/post space
void DbgBiTermToStderr(uint32 bihash, uint32 probs,
const char* src, int len) {};
void DbgQuadTermToStderr(uint32 quadhash, uint32 probs,
const char* src, int len) {};
void DbgWordTermToStderr(uint64 wordhash, uint32 probs,
const char* src, int len) {};
} // End namespace cld
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_impl.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h"
// Version text returned by CompactLangDet::DetectLanguageVersion().
// String is "code_version - data_scrape_date"
// Declared as a const array rather than a pointer so that the constant
// itself cannot be re-pointed at run time.
static const char kDetectLanguageVersion[] = "V1.6 - 20081121";
// Large-table version for all ~160 languages (all Tiers)

// Scan interchange-valid UTF-8 bytes and detect most likely language.
//
// buffer, buffer_length - the text to scan and its length.
// is_plain_text         - forwarded unchanged to the detector.
// is_reliable           - output, forwarded unchanged to the detector.
//
// Returns the detected language; an UNKNOWN_LANGUAGE result is reported as
// ENGLISH.
//
// This is the hinted DetectLanguageSummary() overload called with no hints
// (empty TLD, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE); that overload runs the
// detector with extended languages disabled and applies the ENGLISH default.
// The top-3 summary it produces is discarded here.
Language CompactLangDet::DetectLanguage(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    bool* is_reliable) {
  // Scratch outputs required by the summary call; not exposed to the caller.
  Language language3[3];
  int percent3[3];
  int text_bytes;

  return CompactLangDet::DetectLanguageSummary(
      buffer,
      buffer_length,
      is_plain_text,
      "",                // tld_hint: none
      UNKNOWN_ENCODING,  // encoding_hint: none
      UNKNOWN_LANGUAGE,  // language_hint: none
      language3,
      percent3,
      &text_bytes,
      is_reliable);
}
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
//
// language3, percent3, text_bytes and is_reliable are outputs filled in by
// the detector; language3 and percent3 must have room for 3 entries.
//
// Returns the detected language; an UNKNOWN_LANGUAGE result is reported as
// ENGLISH.
//
// This is the hinted overload called with no hints (empty TLD,
// UNKNOWN_ENCODING, UNKNOWN_LANGUAGE); that overload runs the detector with
// extended languages disabled and applies the ENGLISH default.
Language CompactLangDet::DetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  return CompactLangDet::DetectLanguageSummary(
      buffer,
      buffer_length,
      is_plain_text,
      "",                // tld_hint: none
      UNKNOWN_ENCODING,  // encoding_hint: none
      UNKNOWN_LANGUAGE,  // language_hint: none
      language3,
      percent3,
      text_bytes,
      is_reliable);
}
// Hinted variant of DetectLanguageSummary().
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
//
// The three hints are forwarded unchanged to the detector, which is run with
// extended languages disabled, flags 0 and no "plus one" language. The
// normalized scores it produces are discarded.
//
// Returns the detected language; an UNKNOWN_LANGUAGE result is reported as
// ENGLISH.
Language CompactLangDet::DetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,       // "id" boosts Indonesian
    int encoding_hint,          // SJS boosts Japanese
    Language language_hint,     // ITALIAN boosts it
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  // Fixed detector settings for the non-extended entry points.
  Language plus_one = UNKNOWN_LANGUAGE;
  int flags = 0;
  bool allow_extended_lang = false;
  // Required by the detector, but not part of this overload's interface.
  double normalized_score3[3];

  Language detected = CompactLangDetImpl::DetectLanguageSummaryV25(
      buffer, buffer_length, is_plain_text,
      tld_hint, encoding_hint, language_hint,
      allow_extended_lang, flags, plus_one,
      language3, percent3, normalized_score3,
      text_bytes, is_reliable);

  // Default to English
  return (detected == UNKNOWN_LANGUAGE) ? ENGLISH : detected;
}
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from ext_lang_enc.h
//
// language3, percent3, text_bytes and is_reliable are outputs filled in by
// the detector; language3 and percent3 must have room for 3 entries.
//
// Unlike the DetectLanguage*() entry points, the result is returned as is:
// UNKNOWN_LANGUAGE is not replaced by ENGLISH.
//
// This is the score-returning overload called with no hints (empty TLD,
// UNKNOWN_ENCODING, UNKNOWN_LANGUAGE); the normalized scores are discarded.
Language CompactLangDet::ExtDetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  // Scratch output required by the score-returning overload.
  double normalized_score3[3];

  // Do not default to English
  return CompactLangDet::ExtDetectLanguageSummary(
      buffer,
      buffer_length,
      is_plain_text,
      "",                // tld_hint: none
      UNKNOWN_ENCODING,  // encoding_hint: none
      UNKNOWN_LANGUAGE,  // language_hint: none
      language3,
      percent3,
      normalized_score3,
      text_bytes,
      is_reliable);
}
// Hinted variant of ExtDetectLanguageSummary().
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from ext_lang_enc.h
//
// The result is returned as is: UNKNOWN_LANGUAGE is not replaced by ENGLISH.
//
// This is the score-returning overload with the normalized scores discarded;
// all other arguments are forwarded unchanged.
Language CompactLangDet::ExtDetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,       // "id" boosts Indonesian
    int encoding_hint,          // SJS boosts Japanese
    Language language_hint,     // ITALIAN boosts it
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  // Scratch output required by the score-returning overload.
  double normalized_score3[3];

  // Do not default to English
  return CompactLangDet::ExtDetectLanguageSummary(
      buffer,
      buffer_length,
      is_plain_text,
      tld_hint,
      encoding_hint,
      language_hint,
      language3,
      percent3,
      normalized_score3,
      text_bytes,
      is_reliable);
}
// Hinted extended detection that also returns internal language scores as a
// ratio to normal score for real text in that language. Scores close to 1.0
// indicate normal text, while scores far away from 1.0 indicate badly-skewed
// text or gibberish
//
// All arguments are forwarded unchanged to the detector, which is run with
// extended languages enabled, flags 0 and no "plus one" language. The result
// is returned as is: UNKNOWN_LANGUAGE is not replaced by ENGLISH.
Language CompactLangDet::ExtDetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,       // "id" boosts Indonesian
    int encoding_hint,          // SJS boosts Japanese
    Language language_hint,     // ITALIAN boosts it
    Language* language3,
    int* percent3,
    double* normalized_score3,
    int* text_bytes,
    bool* is_reliable) {
  // Fixed detector settings for the extended entry points.
  Language plus_one = UNKNOWN_LANGUAGE;
  int flags = 0;
  bool allow_extended_lang = true;

  // Do not default to English
  return CompactLangDetImpl::DetectLanguageSummaryV25(
      buffer, buffer_length, is_plain_text,
      tld_hint, encoding_hint, language_hint,
      allow_extended_lang, flags, plus_one,
      language3, percent3, normalized_score3,
      text_bytes, is_reliable);
}
// Return version text string
// String is "code_version - data_scrape_date"
// The result is the file-level kDetectLanguageVersion constant, so the same
// text is returned on every call.
const char* CompactLangDet::DetectLanguageVersion() {
return kDetectLanguageVersion;
}
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// NOTE:
// This code has not yet been evaluated against LangId, which is the official
// production language identification system. However, it seems to be of
// similar precision overall, and it covers all the Google languages in
// i18n/languages/proto/languages.proto
// except the four Creoles_and_Pigins.
// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
// HAITIAN_CREOLE is detected as such.
// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as MOLDAVIAN.
// SERBO_CROATIAN, BOSNIAN, CROATIAN, SERBIAN, MONTENEGRIN in the Latin script
// are all detected as CROATIAN; in the Cyrillic script as SERBIAN.
// Zhuang is detected in the Latin script only.
//
// The Google interface languages X_PIG_LATIN and X_KLINGON are detected in the
// extended calls ExtDetectLanguageSummary(). BorkBorkBork, ElmerFudd, and
// Hacker are not detected (too little training data).
//
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
// is high enough. This happens with non-text input such as the bytes of a
// JPEG, and also with some text in languages outside the Google Language
// enum, such as Ilonggo.
//
// The following languages are detected in multiple scripts:
// AZERBAIJANI (Latin, Cyrillic*, Arabic*)
// BURMESE (Latin, Myanmar)
// HAUSA (Latin, Arabic)
// KASHMIRI (Arabic, Devanagari)
// KAZAKH (Latin, Cyrillic, Arabic)
// KURDISH (Latin*, Arabic)
// KYRGYZ (Cyrillic, Arabic)
// LIMBU (Devanagari, Limbu)
// MONGOLIAN (Cyrillic, Mongolian)
// SANSKRIT (Latin, Devanagari)
// SINDHI (Arabic, Devanagari)
// TAGALOG (Latin, Tagalog)
// TAJIK (Cyrillic, Arabic*)
// TATAR (Latin, Cyrillic, Arabic)
// TURKMEN (Latin, Cyrillic, Arabic)
// UIGHUR (Latin, Cyrillic, Arabic)
// UZBEK (Latin, Cyrillic, Arabic)
//
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
// Arabic script.
//
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_
#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_
#include "third_party/cld/bar/toolbar/cld/i18n/languages/public/languages.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_impl.h"
namespace CompactLangDet {
  // Scan interchange-valid UTF-8 bytes and detect most likely language,
  // or set of languages.
  //
  // Design goals:
  //   Skip over big stretches of HTML tags
  //   Able to return ranges of different languages
  //   Relatively small tables and relatively fast processing
  //   Thread safe
  //
  // For HTML documents, tags are skipped, along with <script> ... </script>
  // and <style> ... </style> sequences, and entities are expanded.
  //
  // We distinguish between bytes of the raw input buffer and bytes of non-tag
  // text letters. Since tags can be over 50% of the bytes of an HTML Page,
  // and are nearly all seven-bit ASCII English, we prefer to distinguish
  // language mixture fractions based on just the non-tag text.
  //
  // Inputs: text and text_length
  //   Code skips HTML tags and expands HTML entities, unless
  //   is_plain_text is true
  // Outputs:
  //   language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //   percent3 is an array of the text percentages 0..100 of the top 3
  //   languages
  //   text_bytes is the amount of non-tag/letters-only text found
  //   is_reliable set true if the returned Language is some amount more
  //   probable than the second-best Language. Calculation is a complex
  //   function of the length of the text and the different-script runs of
  //   text.
  // Return value: the most likely Language for the majority of the input text
  //   Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
  //   defaults to ENGLISH.
  //
  // DetectLanguage() and both DetectLanguageSummary() overloads return
  // ENGLISH instead of UNKNOWN_LANGUAGE, for backwards compatibility with
  // LLD.
  //
  // The ExtDetectLanguageSummary() overloads may return UNKNOWN_LANGUAGE, and
  // also return extended language codes from ext_lang_enc.h
  //
  // Subsetting: For fast detection over large documents, these routines will
  // scan non-tag text of the initial part of a document, then will
  // skip 4-16 bytes and subsample text in the rest of the document, up to a
  // fixed limit (currently 160KB of non-tag letters).
  //

  // Scan interchange-valid UTF-8 bytes and detect most likely language
  Language DetectLanguage(
      const char* buffer,
      int buffer_length,
      bool is_plain_text,
      bool* is_reliable);

  // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
  // language3[0] is also the return value
  // NOTE(review): the ENGLISH substitution described above is applied to the
  // return value only, so language3[0] may still be UNKNOWN_LANGUAGE in that
  // case -- confirm against the implementation.
  Language DetectLanguageSummary(
      const char* buffer,
      int buffer_length,
      bool is_plain_text,
      Language* language3,
      int* percent3,
      int* text_bytes,
      bool* is_reliable);

  // Same as above, with hints supplied
  // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
  // language3[0] is also the return value
  // NOTE(review): the ENGLISH substitution described above is applied to the
  // return value only, so language3[0] may still be UNKNOWN_LANGUAGE in that
  // case -- confirm against the implementation.
  Language DetectLanguageSummary(
      const char* buffer,
      int buffer_length,
      bool is_plain_text,
      const char* tld_hint,       // "id" boosts Indonesian
      int encoding_hint,          // SJS boosts Japanese
      Language language_hint,     // ITALIAN boosts it
      Language* language3,
      int* percent3,
      int* text_bytes,
      bool* is_reliable);

  // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
  // languages.
  //
  // Extended languages are additional Google interface languages and Unicode
  // single-language scripts, from ext_lang_enc.h. They are experimental and
  // this call may be removed.
  //
  // language3[0] is also the return value
  Language ExtDetectLanguageSummary(
      const char* buffer,
      int buffer_length,
      bool is_plain_text,
      Language* language3,
      int* percent3,
      int* text_bytes,
      bool* is_reliable);

  // Same as above, with hints supplied
  // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
  // languages.
  //
  // Extended languages are additional Google interface languages and Unicode
  // single-language scripts, from ext_lang_enc.h. They are experimental and
  // this call may be removed.
  //
  // language3[0] is also the return value
  Language ExtDetectLanguageSummary(
      const char* buffer,
      int buffer_length,
      bool is_plain_text,
      const char* tld_hint,       // "id" boosts Indonesian
      int encoding_hint,          // SJS boosts Japanese
      Language language_hint,     // ITALIAN boosts it
      Language* language3,
      int* percent3,
      int* text_bytes,
      bool* is_reliable);

  // Same as above, and also returns internal language scores as a ratio to
  // normal score for real text in that language. Scores close to 1.0 indicate
  // normal text, while scores far away from 1.0 indicate badly-skewed text or
  // gibberish
  //
  // normalized_score3 is an output array with one score per entry of
  // language3.
  Language ExtDetectLanguageSummary(
      const char* buffer,
      int buffer_length,
      bool is_plain_text,
      const char* tld_hint,       // "id" boosts Indonesian
      int encoding_hint,          // SJS boosts Japanese
      Language language_hint,     // ITALIAN boosts it
      Language* language3,
      int* percent3,
      double* normalized_score3,
      int* text_bytes,
      bool* is_reliable);

  // Return version text string
  // String is "code_version - data_scrape_date"
  const char* DetectLanguageVersion();
}  // End namespace CompactLangDet
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil.h"
// Generated CJK bigram table. This build ships the empty table: a single
// all-zero bucket and a single all-zero indirect entry.
// Suppressed:
// az-Arab az-Cyrl ku-Latn tg-Arab za-Hani zzb-Latn zze-Latn zzh-Latn ru-Latn
// Remapped:
// xxx-Latn=>ut-Latn sh-Latn=>hr-Latn sh-Cyrl=>sr-Cyrl
static const int kCjkBiTableBuildDate = 20090129; // yyyymmdd
static const int kCjkBiTableSize = 1; // Bucket count
// NOTE(review): 0xffffffff does not fit in a signed 32-bit int, so this relies
// on the conversion yielding an all-ones bit pattern -- confirm against the
// type of the CLDTableSummary field it initializes.
static const int kCjkBiTableKeyMask = 0xffffffff; // Mask hash key
// Break the build if the language enums have changed since this table was
// generated.
COMPILE_ASSERT(MONTENEGRIN == 160, k_montenegrin_changed);
COMPILE_ASSERT(EXT_NUM_LANGUAGES == 209, k_ext_num_languages_changed);
// Empty table
static const cld::IndirectProbBucket4 kCjkBiTable[kCjkBiTableSize] = {
// key[4], words[4] in UTF-8
// value[4]
{ {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000] c
};
// Indirect array paired with the bucket table in the summary object below.
static const uint32 kCjkBiTableInd[1] = {
// [0000]
0x00000000, };
// NOTE(review): presumably checks that the number of indirect entries (1)
// fits in a 16-bit index -- confirm with the table generator.
COMPILE_ASSERT(1 < (1 << 16), k_indirectbits_too_small);
// Summary object with external linkage; bundles the two arrays with their
// sizes, the key mask and the build date, in that order.
extern const cld::CLDTableSummary kCjkBiTable_obj = {
kCjkBiTable,
kCjkBiTableInd,
kCjkBiTableSize,
ARRAYSIZE(kCjkBiTableInd),
kCjkBiTableKeyMask,
kCjkBiTableBuildDate,
};
// End of generated tables
// End of generated tables
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil.h"
// Generated long-word table. This build ships the empty table: a single
// all-zero bucket and a single all-zero indirect entry.
// Suppressed:
// az-Arab az-Cyrl ku-Latn tg-Arab za-Hani zzb-Latn zze-Latn zzh-Latn ru-Latn
// Remapped:
// xxx-Latn=>ut-Latn sh-Latn=>hr-Latn sh-Cyrl=>sr-Cyrl
// ms/id probabilities leveled

static const int kLongWord8TableBuildDate = 20081007;  // yyyymmdd
static const int kLongWord8TableSize = 1;              // Bucket count
static const int kLongWord8TableKeyMask = 0xffffffff;  // Mask hash key

// Break the build if the language enums have changed since this table was
// generated. (These two checks used to be listed twice; once is enough.)
COMPILE_ASSERT(MONTENEGRIN == 160, k_montenegrin_changed);
COMPILE_ASSERT(EXT_NUM_LANGUAGES == 209, k_ext_num_languages_changed);

// Empty table
static const cld::IndirectProbBucket4 kLongWord8Table[kLongWord8TableSize] = {
  // key[4], words[4] in UTF-8
  // value[4]
  { {0x00000000,0x00000000,0x00000000,0x00000000}},  // [000] c
};

// Indirect array paired with the bucket table in the summary object below.
static const uint32 kLongWord8TableInd[1] = {
  // [0000]
  0x00000000, };

COMPILE_ASSERT(1 < (1 << 16), k_indirectbits_too_small);

// Summary object with external linkage; bundles the two arrays with their
// sizes, the key mask and the build date, in that order.
extern const cld::CLDTableSummary kLongWord8Table_obj = {
  kLongWord8Table,
  kLongWord8TableInd,
  kLongWord8TableSize,
  ARRAYSIZE(kLongWord8TableInd),
  kLongWord8TableKeyMask,
  kLongWord8TableBuildDate,
};

// End of generated tables
// End of generated tables
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_GENERATED_MEANSCORE_H__
#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_GENERATED_MEANSCORE_H__
#if 0
// Generated 2008.05.08 with boot3abc
static const short kMeanScore[256 * 4] = {
769, 0, 0, 0, 1011, 0, 0, 0, 1263, 0, 0, 0, 1934, 0, 0, 0,
1039, 0, 0, 0, 1296, 0, 0, 0, 0, 0, 0, 1216, 907, 0, 0, 0,
0, 0, 0, 3032, 0, 0, 0, 3423, 971, 0, 0, 0, 1855, 0, 0, 0,
794, 0, 0, 0, 0, 1099, 0, 0, 733, 0, 0, 0, 1201, 0, 0, 0,
0, 0, 0, 1523, 1539, 0, 0, 0, 0, 0, 0, 1024, 1677, 0, 0, 0,
1929, 0, 0, 0, 1917, 0, 0, 0, 1414, 0, 0, 0, 1954, 0, 0, 0,
1183, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 921, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1425, 0, 0, 0, 621, 0, 0, 0,
1498, 0, 0, 0, 1532, 0, 0, 0, 0, 1021, 0, 0, 0, 0, 0, 823,
0, 748, 0, 0, 0, 0, 0, 863, 1543, 0, 0, 0, 854, 0, 0, 0,
1131, 0, 0, 0, 0, 0, 0, 1024, 1539, 0, 0, 0, 0, 0, 0, 907,
0, 0, 0, 1024, 1257, 0, 0, 0, 0, 0, 0, 1024, 0, 1029, 0, 0,
893, 0, 0, 0, 599, 0, 0, 0, 0, 0, 1241, 0, 0, 0, 0, 642,
0, 0, 0, 1024, 0, 0, 0, 1024, 0, 0, 1165, 0, 875, 0, 0, 0,
826, 0, 0, 0, 1225, 0, 0, 0, 369, 0, 0, 0, 0, 0, 0, 1024,
0, 0, 0, 1024, 1667, 0, 0, 0, 1021, 0, 0, 0, 1579, 0, 0, 0,
0, 0, 0, 594, 1226, 0, 0, 0, 1873, 0, 0, 0, 1041, 0, 0, 0,
1528, 0, 0, 0, 0, 0, 0, 1606, 1203, 0, 0, 0, 860, 0, 0, 0,
1303, 894, 1204, 0, 0, 0, 0, 714, 1679, 0, 0, 0, 0, 0, 0, 1024,
0, 0, 0, 817, 0, 0, 977, 0, 0, 0, 0, 0, 0, 0, 0, 1024,
663, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1326, 0, 0, 0,
1312, 0, 0, 0, 1480, 0, 0, 0, 1406, 0, 0, 0, 1605, 953, 0, 0,
0, 835, 1296, 0, 1205, 0, 0, 0, 1321, 0, 0, 0, 0, 0, 0, 1234,
944, 649, 0, 0, 1429, 0, 0, 0, 1402, 1109, 1055, 0, 0, 0, 1108, 0,
0, 1193, 0, 0, 0, 0, 0, 1024, 0, 0, 0, 1024, 0, 0, 1052, 0,
877, 0, 0, 0, 888, 0, 0, 0, 1170, 0, 0, 0, 1578, 0, 0, 1024,
0, 0, 0, 1024, 0, 0, 0, 895, 0, 0, 0, 1024, 0, 0, 0, 1024,
0, 0, 0, 0, 826, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 745,
965, 0, 0, 0, 645, 0, 0, 0, 927, 846, 0, 0, 1336, 0, 0, 0,
// 0, 655, 0, 0, 0, 0, 982, 0, 1778, 0, 0, 0, 1563, 0, 0, 0, // original
0,1233, 0, 0, 0, 0, 982, 0, 1778, 0, 0, 0, 1563, 0, 0, 0, // Moldavian[116] 2008.08.08
0, 746, 0, 0, 928, 509, 0, 0, 0, 0, 0, 0, 1226, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// There are a lot of Afar false hits on lines with "radar" or "radares"
// Also lines with "libdata" and related
// So I artificially removed Afar from _rada, _libd etc. in quads table.
// 1308, 0, 0, 0, 1031, 0, 0, 0, 0, 1022, 0, 0, 0, 0, 0, 0, // original
1308, 0, 0, 0, 1031, 0, 0, 0, 0, 1022, 0, 0, 1762, 0, 0, 0, // Afar[131] 2008.09.05
1918, 0, 0, 0, 0, 958, 0, 0, 1761, 0, 0, 0, 0, 0, 0, 913,
1564, 0, 0, 0, 2155, 0, 0, 0, 1113, 0, 0, 0, 1402, 0, 0, 0,
2372, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1184, 0, 0, 0,
1650, 0, 0, 0, 1482, 0, 0, 0, 1746, 0, 0, 0, 1481, 0, 0, 0,
1313, 0, 0, 0, 1720, 0, 0, 0, 1579, 0, 0, 458, 1192, 0, 0, 0,
1346, 0, 0, 0, 1402, 0, 0, 0, 1462, 0, 0, 0, 2228, 0, 0, 1498,
// 0, 0, 0, 0, 1199, 0, 0, 0, 1462, 0, 0, 0, 1636, 0, 0, 0, // original
1376, 0, 0, 0, 1199, 0, 0, 0, 1462, 0, 0, 0, 1636, 0, 0, 0, // Khasi[156] 2008.09.05
0, 0, 0, 0, 2060, 0, 0, 0, 0, 0, 0, 0, 1836, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1024, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
#endif
#if 1
// Generated 2008.10.01 from
// /export/hda3/cld/20080409/b0123a_9000_samp_prune.utf8
// Four-byte quads, four-byte longwords (included here)
// Extended languages start at [165]
// ks-Deva fix included
//
static const short kMeanScore[256 * 4] = {
612, 0, 0, 0, 614, 0, 0, 0, 799, 0, 0, 0, 1310, 0, 0, 0,
678, 0, 0, 0, 887, 0, 0, 0, 0, 0, 0, 1073, 510, 0, 0, 0,
0, 0, 0, 3109, 0, 0, 0, 3423, 563, 0, 0, 0, 1406, 0, 0, 0,
509, 0, 0, 0, 0, 750, 0, 0, 449, 0, 0, 0, 825, 0, 0, 0,
0, 0, 0, 1820, 1153, 0, 0, 0, 0, 0, 0, 1024, 1443, 0, 0, 0,
1458, 0, 0, 0, 1320, 0, 0, 0, 1002, 0, 0, 0, 1518, 0, 0, 0,
972, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 606, 0, 0,
735, 0, 0, 0, 0, 557, 0, 0, 1098, 0, 0, 0, 496, 0, 0, 0,
1060, 0, 0, 0, 1293, 0, 0, 0, 0, 709, 0, 0, 0, 0, 0, 676,
0, 656, 0, 0, 0, 0, 0, 671, 545, 0, 0, 0, 698, 0, 0, 0,
584, 0, 0, 0, 0, 0, 0, 1024, 1422, 0, 0, 0, 0, 0, 0, 754,
0, 0, 0, 1024, 1182, 0, 0, 0, 0, 0, 0, 1024, 0, 860, 0, 0,
685, 0, 0, 0, 438, 0, 0, 0, 0, 0, 1111, 0, 0, 0, 0, 613,
0, 0, 0, 1024, 0, 0, 0, 1024, 0, 0, 1019, 0, 600, 0, 0, 0,
746, 0, 0, 0, 1001, 0, 0, 0, 350, 0, 0, 0, 0, 0, 0, 1024,
0, 0, 0, 1024, 1318, 0, 0, 0, 812, 0, 0, 0, 1130, 0, 0, 0,
0, 0, 0, 507, 972, 0, 0, 0, 1539, 0, 0, 0, 787, 0, 0, 0,
1174, 0, 0, 0, 0, 0, 0, 1780, 911, 0, 0, 0, 695, 0, 0, 0,
1074, 881, 968, 0, 0, 0, 0, 571, 1377, 0, 0, 0, 0, 0, 0, 1024,
0, 0, 0, 739, 0, 0, 876, 0, 0, 0, 0, 0, 0, 0, 0, 1024,
427, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 978, 0, 0, 0,
1031, 0, 0, 0, 1182, 0, 0, 0, 1094, 0, 0, 0, 1352, 907, 0, 0,
0, 790, 1060, 0, 950, 0, 0, 0, 1169, 0, 0, 0, 0, 0, 0, 1059,
0, 0, 0, 0, 1094, 0, 0, 0, 1127, 1023, 841, 0, 0, 0, 968, 0,
0, 1028, 0, 0, 0, 0, 0, 1024, 0, 0, 0, 1024, 0, 0, 891, 0,
688, 0, 0, 0, 664, 0, 0, 0, 864, 0, 0, 0, 1292, 0, 0, 1024,
0, 0, 0, 1024, 0, 0, 0, 753, 0, 0, 0, 1024, 0, 0, 0, 1024,
0, 0, 0, 0, 489, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 573,
755, 0, 0, 0, 565, 0, 0, 0, 727, 836, 917, 0, 1080, 0, 0, 0,
0, 583, 0, 0, 0, 0, 815, 0, 1425, 0, 0, 0, 1295, 0, 0, 0,
0, 912, 0, 0, 1210, 708, 0, 0, 0, 0, 0, 0, 988, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1194, 0, 0, 0, 840, 0, 0, 0, 0, 879, 0, 0, 0, 0, 0, 0,
1669, 0, 0, 0, 0, 846, 0, 0, 1451, 0, 0, 0, 0, 0, 0, 808,
1317, 0, 0, 0, 1685, 0, 0, 0, 911, 0, 0, 0, 1173, 0, 0, 0,
1897, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 860, 0, 0, 0,
1424, 0, 0, 0, 1100, 0, 0, 0, 1360, 0, 0, 0, 1166, 0, 0, 0,
1012, 0, 0, 0, 1749, 0, 0, 0, 1381, 0, 0, 513, 928, 0, 0, 0,
1147, 0, 0, 0, 1163, 0, 0, 0, 1029, 0, 0, 0, 1873, 0, 0, 0,
0, 0, 0, 0, 779, 0, 0, 0, 1130, 0, 0, 0, 1426, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1781, 0, 0, 0, 0, 0, 0, 0,
1463, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1024, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
#endif
#if 0
// Default value for starting over building this data
static const short kMeanScore[256 * 4] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
#endif
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_GENERATED_MEANSCORE_H__
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h"
// Detection control flags. Each flag is a distinct single bit; their meanings
// are described in the "Flag meanings" comment that follows.
static const int kCLDFlagFinish   = 1 << 0;  // 1
static const int kCLDFlagSqueeze  = 1 << 1;  // 2
static const int kCLDFlagRepeats  = 1 << 2;  // 4
static const int kCLDFlagTop40    = 1 << 3;  // 8
static const int kCLDFlagShort    = 1 << 4;  // 16
static const int kCLDFlagHint     = 1 << 5;  // 32; experimental, undebugged
static const int kCLDFlagUseWords = 1 << 6;  // 64
/***
Flag meanings:
Flags are used in the context of a recursive call from Detect to itself,
trying to deal in a more restrictive way with input that was not reliably
identified in the top-level call.
Finish -- Do not further recurse; return whatever result ensues, even if it is
unreliable. Typically set in any recursive call to take a second try
on unreliable text.
Squeeze -- For each text run, do an inplace cheapsqueeze to remove chunks of
highly repetitive text and chunks of text with too many 1- and
2-letter words. This avoids scoring repetitive or useless non-text
crap in large files such as bogus JPEGs within an HTML file.
Repeats -- When scoring a text run, do a cheap prediction of each character
and do not score a unigram/quadgram if the last character of same is
correctly predicted. This is a slower, finer-grained form of
cheapsqueeze, typically used when the first pass got unreliable
results.
Top40 -- Restrict the set of scored languages to the Google "Top 40*", which is
actually 38 languages. This gets rid of about 110 languages that
represent about 0.7% of the web. Typically used when the first pass
got unreliable results.
Short -- Use trigram (three letter) scoring instead of quadgrams. Restricted to
the top 40* languages, Latin and Cyrillic scripts only.
Not as precise as quadgrams, but it gives some plausible result on
1- or 2-word text in major languages.
Hint -- EXPERIMENTAL flag for compact_lang_det_test.cc to indicate a language
hint supplied in parameter plus_one.
UseWords -- In addition to scoring quad/uni/nil-grams, score complete words
Tentative decision logic:
In the middle of first pass -- After 4KB of text, look at the front 256 bytes
of every full 4KB buffer. If it compresses very well (say 3:1) or has
lots of spaces (say 1 of every 4 bytes), assume that the input is
large and contains lots of bogus non-text. Recurse, passing the
Squeeze flag to strip out chunks of this non-text.
At the end of the first pass --
If the top language is reliable and >= 70% of the document, return.
Else if the top language is reliable and top+2nd >= say 94%, return.
Else, either the top language is not reliable or there is a lot of
other crap.
***/
namespace CompactLangDetImpl {

  // Detection scans interchange-valid UTF-8 bytes and reports the most likely
  // language, or set of languages.
  //
  // Design goals:
  //   - skip over big stretches of HTML tags
  //   - be able to return ranges of different languages
  //   - relatively small tables and relatively fast processing
  //   - thread safe

  // One per-script table entry: a count paired with a pointer to Languages.
  // NOTE(review): presumably perscript_count is the number of entries that
  // perscript_lang points at -- confirm against the code that fills this in.
  struct PerScriptPair {
    int perscript_count;
    const Language* perscript_lang;
  };

  // Bundle of hashing constants plus pointers to the lookup tables.
  struct LangDetObj {
    // Constants for hashing a 4-7 byte quadgram to 32 bits
    const int kQuadHashB4Shift;
    const int kQuadHashB4bShift;
    const int kQuadHashB5Shift;
    const int kQuadHashB5bShift;

    // Constants for turning the 32-bit hash into a kQuadKeyTable
    // subscript/key
    const int kHashvalToSubShift;
    const uint32 kHashvalToSubMask;
    const int kHashvalToKeyShift;
    const uint32 kHashvalToKeyMask;
    const int kHashvalAssociativity;

    // Pointers to the actual tables
    const PerScriptPair* kPerScriptPair;
    const uint16* kQuadKeyTable;
    const uint32* kQuadValueTable;
  };

  // Detects the language(s) of |buffer|.
  //
  // For HTML documents, tags are skipped, as are <script> ... </script> and
  // <style> ... </style> sequences, and entities are expanded.
  //
  // A distinction is made between bytes of the raw input buffer and bytes of
  // non-tag text letters. Tags can be over 50% of the bytes of an HTML page
  // and are nearly all seven-bit ASCII English, so language mixture fractions
  // are based on just the non-tag text.
  //
  // Inputs:
  //   buffer, buffer_length -- the text and its length
  //   is_plain_text -- if true, do NOT parse/skip HTML tags nor entities
  //
  // Outputs:
  //   language3 -- array of the top 3 languages, or UNKNOWN_LANGUAGE
  //   percent3 -- array of the text percentages 0..100 of the top 3 languages
  //   normalized_score3 -- array of internal scores, normalized to the average
  //     score for each language over a body of training text. A normalized
  //     score significantly away from 1.0 indicates very skewed text or
  //     gibberish.
  //   text_bytes -- the amount of non-tag/letters-only text found
  //   is_reliable -- set true if the returned Language is at least 2**30
  //     times more probable than the second-best Language
  //
  // Return value: the most likely Language for the majority of the input
  //   text. Length 0 input and text with no reliable letter sequences return
  //   UNKNOWN_LANGUAGE.
  //
  // Subsetting: for fast detection over large documents, the initial part of
  //   a document's non-tag text is scanned, then 4-16 bytes are skipped at a
  //   time and the rest of the document is subsampled, up to a fixed limit
  //   (currently 160KB of non-tag letters).
  Language DetectLanguageSummaryV25(
      const char* buffer,
      int buffer_length,
      bool is_plain_text,
      const char* tld_hint,         // "id" boosts Indonesian
      int encoding_hint,            // SJS boosts Japanese
      Language language_hint,       // ITALIAN boosts it
      bool allow_extended_lang,
      int flags,
      Language plus_one,
      Language* language3,
      int* percent3,
      double* normalized_score3,
      int* text_bytes,
      bool* is_reliable);

  // For unit testing:
  // Removes portions of text that have a high density of spaces, or that are
  // overly repetitive, squeezing what remains in-place toward the front of
  // the input buffer. Returns the new, possibly-shorter length.
  int CheapSqueezeInplace(char* isrc, int srclen, int ichunksize);

}  // End namespace CompactLangDetImpl
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_macros.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_strtoint.h"
// Names of the extended languages (those numbered above NUM_LANGUAGES).
// Entry i is looked up as kExtLanguageName[lang - EXT_LANGUAGE_BASE].
// These strings are also the C enum declared names.
static const char* const kExtLanguageName[] = {
  "X_BORK_BORK_BORK",   // +0
  "X_PIG_LATIN",        // +1
  "X_HACKER",           // +2
  "X_KLINGON",          // +3
  "X_ELMER_FUDD",       // +4

  // Pseudo-languages for Unicode scripts that express a single language
  "X_OGHAM",            // +5
  "X_RUNIC",            // +6
  "X_YI",               // +7
  "X_OLD_ITALIC",       // +8
  "X_GOTHIC",           // +9
  "X_DESERET",          // +10
  "X_HANUNOO",          // +11
  "X_BUHID",            // +12
  "X_TAGBANWA",         // +13
  "X_TAI_LE",           // +14
  "X_LINEAR_B",         // +15
  "X_UGARITIC",         // +16
  "X_SHAVIAN",          // +17
  "X_OSMANYA",          // +18
  "X_CYPRIOT",          // +19
  "X_BUGINESE",         // +20
  "X_COPTIC",           // +21
  "X_NEW_TAI_LUE",      // +22
  "X_GLAGOLITIC",       // +23
  "X_TIFINAGH",         // +24
  "X_SYLOTI_NAGRI",     // +25
  "X_OLD_PERSIAN",      // +26
  "X_KHAROSHTHI",       // +27
  "X_BALINESE",         // +28
  "X_CUNEIFORM",        // +29
  "X_PHOENICIAN",       // +30
  "X_PHAGS_PA",         // +31
  "X_NKO",              // +32

  // Unicode 5.1
  "X_SUDANESE",         // +33
  "X_LEPCHA",           // +34
  "X_OL_CHIKI",         // +35
  "X_VAI",              // +36
  "X_SAURASHTRA",       // +37
  "X_KAYAH_LI",         // +38
  "X_REJANG",           // +39
  "X_LYCIAN",           // +40
  "X_CARIAN",           // +41
  "X_LYDIAN",           // +42
  "X_CHAM",             // +43
};
// These are the C enum declared names of the base languages, for programs
// creating C code.
// The table is indexed directly by Language value (ExtLanguageDeclaredName
// returns kExtLangDeclaredName[lang]); the trailing /* n */ comment on each
// entry records its index. The COMPILE_ASSERT after the table checks that it
// holds exactly NUM_LANGUAGES entries, so the two must be updated together.
static const char* const kExtLangDeclaredName[] = {
"ENGLISH", /* 0 */
"DANISH", /* 1 */
"DUTCH", /* 2 */
"FINNISH", /* 3 */
"FRENCH", /* 4 */
"GERMAN", /* 5 */
"HEBREW", /* 6 */
"ITALIAN", /* 7 */
"JAPANESE", /* 8 */
"KOREAN", /* 9 */
"NORWEGIAN", /* 10 */
"POLISH", /* 11 */
"PORTUGUESE", /* 12 */
"RUSSIAN", /* 13 */
"SPANISH", /* 14 */
"SWEDISH", /* 15 */
"CHINESE", /* 16 */
"CZECH", /* 17 */
"GREEK", /* 18 */
"ICELANDIC", /* 19 */
"LATVIAN", /* 20 */
"LITHUANIAN", /* 21 */
"ROMANIAN", /* 22 */
"HUNGARIAN", /* 23 */
"ESTONIAN", /* 24 */
"TG_UNKNOWN_LANGUAGE", /* 25 */
"UNKNOWN_LANGUAGE", /* 26 */
"BULGARIAN", /* 27 */
"CROATIAN", /* 28 */
"SERBIAN", /* 29 */
"IRISH", /* 30 */
"GALICIAN", /* 31 */
"TAGALOG", /* 32 */
"TURKISH", /* 33 */
"UKRAINIAN", /* 34 */
"HINDI", /* 35 */
"MACEDONIAN", /* 36 */
"BENGALI", /* 37 */
"INDONESIAN", /* 38 */
"LATIN", /* 39 */
"MALAY", /* 40 */
"MALAYALAM", /* 41 */
"WELSH", /* 42 */
"NEPALI", /* 43 */
"TELUGU", /* 44 */
"ALBANIAN", /* 45 */
"TAMIL", /* 46 */
"BELARUSIAN", /* 47 */
"JAVANESE", /* 48 */
"OCCITAN", /* 49 */
"URDU", /* 50 */
"BIHARI", /* 51 */
"GUJARATI", /* 52 */
"THAI", /* 53 */
"ARABIC", /* 54 */
"CATALAN", /* 55 */
"ESPERANTO", /* 56 */
"BASQUE", /* 57 */
"INTERLINGUA", /* 58 */
"KANNADA", /* 59 */
"PUNJABI", /* 60 */
"SCOTS_GAELIC", /* 61 */
"SWAHILI", /* 62 */
"SLOVENIAN", /* 63 */
"MARATHI", /* 64 */
"MALTESE", /* 65 */
"VIETNAMESE", /* 66 */
"FRISIAN", /* 67 */
"SLOVAK", /* 68 */
"CHINESE_T", /* 69 */
"FAROESE", /* 70 */
"SUNDANESE", /* 71 */
"UZBEK", /* 72 */
"AMHARIC", /* 73 */
"AZERBAIJANI", /* 74 */
"GEORGIAN", /* 75 */
"TIGRINYA", /* 76 */
"PERSIAN", /* 77 */
"BOSNIAN", /* 78 */
"SINHALESE", /* 79 */
"NORWEGIAN_N", /* 80 */
"PORTUGUESE_P", /* 81 */
"PORTUGUESE_B", /* 82 */
"XHOSA", /* 83 */
"ZULU", /* 84 */
"GUARANI", /* 85 */
"SESOTHO", /* 86 */
"TURKMEN", /* 87 */
"KYRGYZ", /* 88 */
"BRETON", /* 89 */
"TWI", /* 90 */
"YIDDISH", /* 91 */
"SERBO_CROATIAN", /* 92 */
"SOMALI", /* 93 */
"UIGHUR", /* 94 */
"KURDISH", /* 95 */
"MONGOLIAN", /* 96 */
"ARMENIAN", /* 97 */
"LAOTHIAN", /* 98 */
"SINDHI", /* 99 */
"RHAETO_ROMANCE", /* 100 */
"AFRIKAANS", /* 101 */
"LUXEMBOURGISH", /* 102 */
"BURMESE", /* 103 */
"KHMER", /* 104 */
"TIBETAN", /* 105 */
"DHIVEHI", /* 106 */ // sometimes spelled Divehi; lang of Maldives
"CHEROKEE", /* 107 */
"SYRIAC", /* 108 */
"LIMBU", /* 109 */
"ORIYA", /* 110 */
"ASSAMESE", /* 111 */
"CORSICAN", /* 112 */
"INTERLINGUE", /* 113 */
"KAZAKH", /* 114 */
"LINGALA", /* 115 */
"MOLDAVIAN", /* 116 */
"PASHTO", /* 117 */
"QUECHUA", /* 118 */
"SHONA", /* 119 */
"TAJIK", /* 120 */
"TATAR", /* 121 */
"TONGA", /* 122 */
"YORUBA", /* 123 */
"CREOLES_AND_PIDGINS_ENGLISH_BASED", /* 124 */
"CREOLES_AND_PIDGINS_FRENCH_BASED", /* 125 */
"CREOLES_AND_PIDGINS_PORTUGUESE_BASED", /* 126 */
"CREOLES_AND_PIDGINS_OTHER", /* 127 */
"MAORI", /* 128 */
"WOLOF", /* 129 */
"ABKHAZIAN", /* 130 */
"AFAR", /* 131 */
"AYMARA", /* 132 */
"BASHKIR", /* 133 */
"BISLAMA", /* 134 */
"DZONGKHA", /* 135 */
"FIJIAN", /* 136 */
"GREENLANDIC", /* 137 */
"HAUSA", /* 138 */
"HAITIAN_CREOLE", /* 139 */
"INUPIAK", /* 140 */
"INUKTITUT", /* 141 */
"KASHMIRI", /* 142 */
"KINYARWANDA", /* 143 */
"MALAGASY", /* 144 */
"NAURU", /* 145 */
"OROMO", /* 146 */
"RUNDI", /* 147 */
"SAMOAN", /* 148 */
"SANGO", /* 149 */
"SANSKRIT", /* 150 */
"SISWANT", /* 151 */
"TSONGA", /* 152 */
"TSWANA", /* 153 */
"VOLAPUK", /* 154 */
"ZHUANG", /* 155 */
"KHASI", /* 156 */
"SCOTS", /* 157 */
"GANDA", /* 158 */
"MANX", /* 159 */
"MONTENEGRIN", /* 160 */
// Add new language declared names just before here
};
// Fails the build if the table above and NUM_LANGUAGES get out of step.
COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
kExtLangDeclaredName_has_incorrect_length);
// Language codes for the extended languages (those above NUM_LANGUAGES).
// Entry i is looked up as kExtLanguageCode[lang - EXT_LANGUAGE_BASE], so the
// order here parallels kExtLanguageName.
// All of these codes were made up, except Klingon's, which comes from
// ISO-639-2.
// NOTE: zza is a standard name
static const char* const kExtLanguageCode[] = {
  // All Latin script
  "zzb",       // X_BORK_BORK_BORK
  "zzp",       // X_PIG_LATIN
  "zzh",       // X_HACKER
  "tlh",       // X_KLINGON
  "zze",       // X_ELMER_FUDD

  // Pseudo-languages for Unicode scripts that express a single language
  "xx-Ogam",
  "xx-Runr",
  "xx-Yiii",
  "xx-Ital",
  "xx-Goth",
  "xx-Dsrt",
  "xx-Hano",
  "xx-Buhd",
  "xx-Tagb",
  "xx-Tale",
  "xx-Linb",
  "xx-Ugar",
  "xx-Shaw",
  "xx-Osma",
  "xx-Cprt",
  "xx-Bugi",
  "xx-Copt",
  "xx-Talu",
  "xx-Glag",
  "xx-Tfng",
  "xx-Sylo",
  "xx-Xpeo",
  "xx-Khar",
  "xx-Bali",
  "xx-Xsux",
  "xx-Phnx",
  "xx-Phag",
  "xx-Nkoo",

  // Unicode 5.1
  "xx-Sund",
  "xx-Lepc",
  "xx-Olck",
  "xx-Vaii",
  "xx-Saur",
  "xx-Kali",
  "xx-Rjng",
  "xx-Lyci",
  "xx-Cari",
  "xx-Lydi",
  "xx-Cham",
};
// Returns the name of |lang| as printed by the lang/enc identifier,
// e.g. "Korean".
//   - A negative value (the no-text-at-all result from a Tote) gives "".
//   - TG_UNKNOWN_LANGUAGE gives "Ignore": it is the placeholder "ignore me"
//     language, used to subtract out HTML, link farms, DNA strings, and a
//     little English porn.
//   - Base languages defer to LanguageName().
//   - Extended languages are read from kExtLanguageName.
//   - Any other value gives invalid_language_name().
const char* ExtLanguageName(const Language lang) {
  if (lang < 0) {
    return "";
  }

  // CompactLanguageDetect extension
  if (lang == TG_UNKNOWN_LANGUAGE) {
    return "Ignore";
  }

  // lang is known to be non-negative from here on.
  if (lang < NUM_LANGUAGES) {
    return LanguageName(lang);
  }

  if ((lang < EXT_LANGUAGE_BASE) || (lang >= EXT_NUM_LANGUAGES)) {
    return invalid_language_name();
  }
  return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
}
// Returns the Language enum spelling of |lang|, e.g. "KOREAN", for programs
// that generate C declarations. Base languages are read from
// kExtLangDeclaredName and extended ones from kExtLanguageName; any other
// value gives "UNKNOWN_LANGUAGE".
const char* ExtLanguageDeclaredName(const Language lang) {
  const bool is_base = (lang >= 0) && (lang < NUM_LANGUAGES);
  const bool is_extended =
      (lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES);

  if (is_base) {
    return kExtLangDeclaredName[lang];
  }
  return is_extended ? kExtLanguageName[lang - EXT_LANGUAGE_BASE]
                     : "UNKNOWN_LANGUAGE";
}
// Returns the language code of |lang|, e.g. "ko".
// TG_UNKNOWN_LANGUAGE (the ignore/porn pseudo-language) is special-cased to
// "xxx"; base languages defer to LanguageCode(); extended languages are read
// from kExtLanguageCode; any other value gives "??".
const char* ExtLanguageCode(const Language lang) {
  if (lang == TG_UNKNOWN_LANGUAGE) {
    return "xxx";
  }
  if ((lang >= 0) && (lang < NUM_LANGUAGES)) {
    return LanguageCode(lang);
  }
  if ((lang < EXT_LANGUAGE_BASE) || (lang >= EXT_NUM_LANGUAGES)) {
    return "??";
  }
  return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
}
// Returns true if the NUL-terminated string |s| starts with |prefix|.
// strncmp stops at the first NUL, so |s| may be shorter than |prefix| without
// any bytes past its terminator being read (unlike a fixed-length memcmp).
static bool ExtLangHasPrefix(const char* s, const char* prefix) {
  return strncmp(s, prefix, strlen(prefix)) == 0;
}

// Convert "en-Latn-GB" to ENGLISH
// Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
// Consider for later: NORWEGIAN, NORWEGIAN_N
// Consider for later: SCOTS, SCOTS_GAELIC
// Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
//
// |src| is a NUL-terminated string holding either a decimal Language number
// or a language name/code such as "en", "pt-BR", "xx-Runr" or "en-Latn-GB".
// Returns the matching Language. When nothing matches, the result is whatever
// LanguageFromCode left in the local result, which starts out as
// UNKNOWN_LANGUAGE.
Language GetLanguageFromNumberOrName(const char* src) {
  if (strspn(src, "0123456789") == strlen(src)) {
    // All digits. NOTE(review): an empty string also passes this test and is
    // handed to strto32 -- presumably yielding 0; confirm that is intended.
    return static_cast<Language>(strto32(src, NULL, 10));
  }

  Language retlang = UNKNOWN_LANGUAGE;
  const size_t len = strlen(src);

  // Merge sets of languages pt-xx en-xx fr-xx (was FLAGS_mergepairs).
  if (ExtLangHasPrefix(src, "pt-")) {return PORTUGUESE;}
  if (ExtLangHasPrefix(src, "en-")) {return ENGLISH;}
  if (ExtLangHasPrefix(src, "fr-")) {return FRENCH;}
  // bs/hr/sr: Latin-script forms map to CROATIAN, Cyrillic ones to SERBIAN.
  // (Original note: "Use NormalizeLanguage instead".)
  if (ExtLangHasPrefix(src, "bs-")) {return CROATIAN;}
  if (ExtLangHasPrefix(src, "hr-")) {return CROATIAN;}
  if (ExtLangHasPrefix(src, "sr-Latn")) {return CROATIAN;}
  if (ExtLangHasPrefix(src, "sh-Latn")) {return CROATIAN;}
  if (ExtLangHasPrefix(src, "sr-Cyrl")) {return SERBIAN;}
  if (ExtLangHasPrefix(src, "sh-Cyrl")) {return SERBIAN;}

  // Extensions
  if (len >= 3) {
    // Standin for ignore/porn "language"
    if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
    if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
    if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
    if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
    if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
    if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
  }

  // We have a name like en-Latn-GB or pt-BR
  // First, get rid of some special cases
  if (len <= 3) {
    // The result may still be overridden by the checks further down.
    LanguageFromCode(src, &retlang);
  } else if (len == 7) {
    // More Extensions. len == 7, so 7-byte compares stay inside the string.
    if (memcmp(src, "xx-", 3) == 0) {
      if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
      if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
      if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
      if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
      if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
      if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
      if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
      if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
      if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
      if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
      if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
      if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
      if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
      if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
      if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
      if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
      if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
      if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
      if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
      if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
      if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
      if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
      if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
      if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
      if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
      if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
      if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
      if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
      // Unicode 5.1
      if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
      if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
      if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
      if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
      if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
      if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
      if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
      if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
      if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
      if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
      if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
    }
  }

  // Some other weird ones
  // Could be Latn or Limb; all our current training data is Latn
  if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
  if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}

  // Multi-country languages
  if (ExtLangHasPrefix(src, "zh")) {
    // The prefix matched, so len >= 2 and the last two bytes exist.
    const char* tail = &src[len - 2];
    if (memcmp(tail, "TW", 2) == 0) {return CHINESE_T;}
    if (memcmp(tail, "HK", 2) == 0) {return CHINESE_T;}
    return CHINESE;
  }
  // Every "pt..." variant (including ...BR) is PORTUGUESE and every "fr..."
  // variant (including ...CA) is FRENCH, so no suffix check is needed.
  if (ExtLangHasPrefix(src, "pt")) {return PORTUGUESE;}
  if (ExtLangHasPrefix(src, "fr")) {return FRENCH;}

  // None of the special cases matched: look up the leading 2- or 3-letter
  // code that precedes a '-'. The length guards keep src[2] and src[3] from
  // being read past the terminator of 1- and 2-character inputs.
  if ((len > 2) && (src[2] == '-')) {
    const char code[3] = {src[0], src[1], '\0'};
    LanguageFromCode(code, &retlang);
  }
  if ((len > 3) && (src[3] == '-')) {
    const char code[4] = {src[0], src[1], src[2], '\0'};
    LanguageFromCode(code, &retlang);
  }
  return retlang;
}
// Pairs a four-letter script name with its UnicodeLScript value.
struct NameScriptPair {
  const char* name;
  UnicodeLScript lscript;
};
// Maps four-letter script names to UnicodeLScript values.
// In alphabetic order (as compared by strcmp) for binary search in
// GetLScriptFromNumberOrName; new entries must keep that order.
static const NameScriptPair kNameScriptPair[] = {
// Scripts added for Unicode 5.1 are tagged individually below.
{"Arab", ULScript_Arabic},
{"Armn", ULScript_Armenian},
{"Bali", ULScript_Balinese},
{"Beng", ULScript_Bengali},
{"Bugi", ULScript_Buginese},
{"Buhd", ULScript_Buhid},
{"Cans", ULScript_Canadian_Aboriginal},
{"Cari", ULScript_Carian}, // Unicode 5.1
{"Cham", ULScript_Cham}, // Unicode 5.1
{"Cher", ULScript_Cherokee},
{"Copt", ULScript_Coptic},
{"Cprt", ULScript_Cypriot},
{"Cyrl", ULScript_Cyrillic},
{"Deva", ULScript_Devanagari},
{"Dsrt", ULScript_Deseret},
{"Ethi", ULScript_Ethiopic},
{"Geor", ULScript_Georgian},
{"Glag", ULScript_Glagolitic},
{"Goth", ULScript_Gothic},
{"Grek", ULScript_Greek},
{"Gujr", ULScript_Gujarati},
{"Guru", ULScript_Gurmukhi},
{"Hani", ULScript_HanCJK},
{"Hano", ULScript_Hanunoo},
{"Hebr", ULScript_Hebrew},
{"Ital", ULScript_Old_Italic},
{"Kali", ULScript_Kayah_Li}, // Unicode 5.1
{"Khar", ULScript_Kharoshthi},
{"Khmr", ULScript_Khmer},
{"Knda", ULScript_Kannada},
{"Laoo", ULScript_Lao},
{"Latn", ULScript_Latin},
{"Lepc", ULScript_Lepcha}, // Unicode 5.1
{"Limb", ULScript_Limbu},
{"Linb", ULScript_Linear_B},
{"Lyci", ULScript_Lycian}, // Unicode 5.1
{"Lydi", ULScript_Lydian}, // Unicode 5.1
{"Mlym", ULScript_Malayalam},
{"Mong", ULScript_Mongolian},
{"Mymr", ULScript_Myanmar},
{"Nkoo", ULScript_Nko},
{"Ogam", ULScript_Ogham},
{"Olck", ULScript_Ol_Chiki}, // Unicode 5.1
{"Orya", ULScript_Oriya},
{"Osma", ULScript_Osmanya},
{"Phag", ULScript_Phags_Pa},
{"Phnx", ULScript_Phoenician},
{"Rjng", ULScript_Rejang}, // Unicode 5.1
{"Runr", ULScript_Runic},
{"Saur", ULScript_Saurashtra}, // Unicode 5.1
{"Shaw", ULScript_Shavian},
{"Sinh", ULScript_Sinhala},
{"Sund", ULScript_Sundanese}, // Unicode 5.1
{"Sylo", ULScript_Syloti_Nagri},
{"Syrc", ULScript_Syriac},
{"Tagb", ULScript_Tagbanwa},
{"Tale", ULScript_Tai_Le},
{"Talu", ULScript_New_Tai_Lue},
{"Taml", ULScript_Tamil},
{"Telu", ULScript_Telugu},
{"Tfng", ULScript_Tifinagh},
{"Tglg", ULScript_Tagalog},
{"Thaa", ULScript_Thaana},
{"Thai", ULScript_Thai},
{"Tibt", ULScript_Tibetan},
{"Ugar", ULScript_Ugaritic},
{"Vaii", ULScript_Vai}, // Unicode 5.1 // NOTE: apparently 'Vai '
{"Xpeo", ULScript_Old_Persian},
{"Xsux", ULScript_Cuneiform},
{"Yiii", ULScript_Yi},
{"Zyyy", ULScript_Common},
// NOTE(review): ISO 15924 uses "Zinh" for Inherited and "Zzzz" for
// unknown/uncoded script -- confirm this mapping is intended.
{"Zzzz", ULScript_Inherited},
};
// Convert "en-Latn-GB" to ULScript_Latin
//
// |src| is a NUL-terminated string holding either a decimal UnicodeLScript
// number or a language tag. For a tag, up to four characters following the
// first '-' are taken as the script name and looked up in kNameScriptPair.
// Returns ULScript_Latin when the tag has no '-' or the name is not in the
// table (so e.g. "en-GB" gives ULScript_Latin).
UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
  if (strspn(src, "0123456789") == strlen(src)) {
    // All digits
    return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
  }

  if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
  if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
  if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
  if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
  // Could be Latn or Limb; all our current training data is Latn
  if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}

  // Isolate just the script field
  const char* dash = strchr(src, '-');
  if (dash == NULL) {return ULScript_Latin;}
  const char* script = dash + 1;      // over the -

  // Copy at most four characters, stopping at the terminator so that a short
  // field such as the "GB" of "en-GB" is never read past its end.
  char temp[5] = {'\0', '\0', '\0', '\0', '\0'};
  for (int i = 0; (i < 4) && (script[i] != '\0'); ++i) {
    temp[i] = script[i];
  }

  // Binary search over the table's own length, so the bounds cannot drift
  // out of step with the number of entries actually present.
  int lo = 0;
  int hi = static_cast<int>(sizeof(kNameScriptPair) /
                            sizeof(kNameScriptPair[0]));
  while (lo < hi) {
    const int mid = (lo + hi) >> 1;
    const int cmp = strcmp(temp, kNameScriptPair[mid].name);
    if (cmp < 0) {
      hi = mid;
    } else if (cmp > 0) {
      lo = mid + 1;
    } else {
      return kNameScriptPair[mid].lscript;
    }
  }
  return ULScript_Latin;
}
// Folds closely related languages into one representative, such as bs/hr/sr:
// Bosnian becomes Croatian (Latin) and Serbo-Croatian becomes Serbian
// (Cyrillic); both Portuguese variants become plain Portuguese.
// Every other language is returned unchanged.
Language NormalizeLanguage(Language lang) {
  switch (lang) {
    case BOSNIAN:
      return CROATIAN;
    case SERBO_CROATIAN:
      return SERBIAN;
    case PORTUGUESE_P:
    case PORTUGUESE_B:
      return PORTUGUESE;
    default:
      return lang;
  }
}
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
#define I18N_ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
#include "third_party/cld/bar/toolbar/cld/i18n/languages/public/languages.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/letterscript_enum.h"
// Leave a small gap after the base languages, so adding one or two is easy.
// Just reduce the gap here (currently 4 entries)
#define EXT_LANGUAGE_BASE (NUM_LANGUAGES + 4)
// Google UI languages
#define X_BORK_BORK_BORK (Language)(EXT_LANGUAGE_BASE+0)
#define X_PIG_LATIN (Language)(EXT_LANGUAGE_BASE+1)
#define X_HACKER (Language)(EXT_LANGUAGE_BASE+2)
#define X_KLINGON (Language)(EXT_LANGUAGE_BASE+3)
#define X_ELMER_FUDD (Language)(EXT_LANGUAGE_BASE+4)
// Pseudo-languages for Unicode scripts that express a single language
#define X_OGHAM (Language)(EXT_LANGUAGE_BASE+5)
#define X_RUNIC (Language)(EXT_LANGUAGE_BASE+6)
#define X_YI (Language)(EXT_LANGUAGE_BASE+7)
#define X_OLD_ITALIC (Language)(EXT_LANGUAGE_BASE+8)
#define X_GOTHIC (Language)(EXT_LANGUAGE_BASE+9)
#define X_DESERET (Language)(EXT_LANGUAGE_BASE+10)
#define X_HANUNOO (Language)(EXT_LANGUAGE_BASE+11)
#define X_BUHID (Language)(EXT_LANGUAGE_BASE+12)
#define X_TAGBANWA (Language)(EXT_LANGUAGE_BASE+13)
#define X_TAI_LE (Language)(EXT_LANGUAGE_BASE+14)
#define X_LINEAR_B (Language)(EXT_LANGUAGE_BASE+15)
#define X_UGARITIC (Language)(EXT_LANGUAGE_BASE+16)
#define X_SHAVIAN (Language)(EXT_LANGUAGE_BASE+17)
#define X_OSMANYA (Language)(EXT_LANGUAGE_BASE+18)
#define X_CYPRIOT (Language)(EXT_LANGUAGE_BASE+19)
#define X_BUGINESE (Language)(EXT_LANGUAGE_BASE+20)
#define X_COPTIC (Language)(EXT_LANGUAGE_BASE+21)
#define X_NEW_TAI_LUE (Language)(EXT_LANGUAGE_BASE+22)
#define X_GLAGOLITIC (Language)(EXT_LANGUAGE_BASE+23)
#define X_TIFINAGH (Language)(EXT_LANGUAGE_BASE+24)
#define X_SYLOTI_NAGRI (Language)(EXT_LANGUAGE_BASE+25)
#define X_OLD_PERSIAN (Language)(EXT_LANGUAGE_BASE+26)
#define X_KHAROSHTHI (Language)(EXT_LANGUAGE_BASE+27)
#define X_BALINESE (Language)(EXT_LANGUAGE_BASE+28)
#define X_CUNEIFORM (Language)(EXT_LANGUAGE_BASE+29)
#define X_PHOENICIAN (Language)(EXT_LANGUAGE_BASE+30)
#define X_PHAGS_PA (Language)(EXT_LANGUAGE_BASE+31)
#define X_NKO (Language)(EXT_LANGUAGE_BASE+32)
// Unicode 5.1
#define X_SUDANESE (Language)(EXT_LANGUAGE_BASE+33)
#define X_LEPCHA (Language)(EXT_LANGUAGE_BASE+34)
#define X_OL_CHIKI (Language)(EXT_LANGUAGE_BASE+35)
#define X_VAI (Language)(EXT_LANGUAGE_BASE+36)
#define X_SAURASHTRA (Language)(EXT_LANGUAGE_BASE+37)
#define X_KAYAH_LI (Language)(EXT_LANGUAGE_BASE+38)
#define X_REJANG (Language)(EXT_LANGUAGE_BASE+39)
#define X_LYCIAN (Language)(EXT_LANGUAGE_BASE+40)
#define X_CARIAN (Language)(EXT_LANGUAGE_BASE+41)
#define X_LYDIAN (Language)(EXT_LANGUAGE_BASE+42)
#define X_CHAM (Language)(EXT_LANGUAGE_BASE+43)
#define EXT_NUM_LANGUAGES (Language)(EXT_LANGUAGE_BASE+44)
// ExtLanguageName
// ------------
// Given the Language, returns its string name as output by the lang/enc
// identifier, e.g. "Korean".
// Returns "invalid_language" if the input is invalid.
// NOTE(review): presumably also accepts the X_* extended values defined in
// this header; the definition lives elsewhere -- confirm.
extern const char* ExtLanguageName(const Language lang);
// ExtLanguageDeclaredName
// ------------
// Given the Language, returns the spelling of its Language enum constant, for
// use by programs that generate C declarations, e.g. "KOREAN".
// Returns "UNKNOWN_LANGUAGE" if the input is invalid.
extern const char* ExtLanguageDeclaredName(const Language lang);
// ExtLanguageCode
// ------------
// Given the Language, returns its language code, e.g. "ko".
// The code is chosen from the following, in order of preference:
// - ISO-639-1 two-letter language code
// (all except those mentioned below)
// - ISO-639-2 three-letter bibliographic language code
// (Tibetan, Dhivehi, Cherokee, Syriac)
// - Google-specific language code
// (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
// Portuguese-Portugal, Portuguese-Brazil, Limbu)
// NOTE(review): the result for an invalid Language is not stated here and the
// definition lives elsewhere -- confirm before relying on it.
extern const char * ExtLanguageCode(const Language lang);
// Convert a language tag such as "en-Latn-GB" to a Language, e.g. ENGLISH.
// Portuguese variants are normalized to PORTUGUESE, not PORTUGUESE_B nor
// PORTUGUESE_P.
// NOTE(review): as the name suggests, src is presumably also allowed to be a
// decimal number giving the Language value directly; the definition is not
// visible from this header -- confirm.
// Consider for later: NORWEGIAN, NORWEGIAN_N
// Consider for later: SCOTS, SCOTS_GAELIC
// Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
//
Language GetLanguageFromNumberOrName(const char* src);
// Convert a language tag such as "en-Latn-GB" to its script, e.g.
// ULScript_Latin.
// A string made up only of decimal digits is instead parsed as a number and
// cast to UnicodeLScript. A few whole tags (zh-TW, zh-CN, pt-BR, pt-PT,
// sit-NP) are special-cased; otherwise the field after the first '-' is looked
// up as a script name. Returns ULScript_Latin when there is no '-' or the
// script name is not recognized.
UnicodeLScript GetLScriptFromNumberOrName(const char* src);
// Merge together some closely related languages: BOSNIAN becomes CROATIAN,
// SERBO_CROATIAN becomes SERBIAN, and PORTUGUESE_P / PORTUGUESE_B become
// PORTUGUESE. All other languages are returned unchanged.
Language NormalizeLanguage(Language lang);
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment