From 8dc5a205c581c3adf5aeebf6d4ccb4a9c8176658 Mon Sep 17 00:00:00 2001
From: "jshin@chromium.org"
 <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>
Date: Tue, 19 Jan 2010 21:32:42 +0000
Subject: [PATCH] Reland r36541 (which went in without any commit log by some
 magic). Was reverted in r36550. Now I'm relanding with the full description.

Port back CLD to Linux and Mac by replacing Windows API calls with ICU's equivalent APIs for normalization.

I also fixed bug 23553 (Traditional Chinese is not detected) by calling LanguageCode instead of LanguageCodeISO639_1. The latter covers only ISO 639-1, but there are languages detected by CLD that are not covered by ISO 639-1. In that case, ISO 639-2 is used. If even ISO 639-2 does not cover the language (e.g. Traditional Chinese), LanguageCode falls back to another table.

The html file for CLD testing (french_sentence.html) is explicitly labelled with charset=ISO-8859-1.

Original Review: http://codereview.chromium.org/523108

BUG=25206,23553
TEST=1. CLD is built on Linux/Mac
     2. The following tests pass:
       - unit_tests: Extension*.DetectTabLang* and CompactLangDet*.*
       - browser_tests: ExtensionBrowserTest.Toolstrip
     3. Install the 'cld extension' in chrome/common/extensions/docs/examples/api/i18n/cld and go to http://news.google.com.tw and 'zh-TW' shows up in the language badge at the upper right (upper-left in he/ar Chrome) corner.
TBR=jcampan







Review URL: http://codereview.chromium.org/545123

TBR=jshin@chromium.org
Review URL: http://codereview.chromium.org/551070

TBR=jshin@chromium.org
Review URL: http://codereview.chromium.org/549091

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@36552 0039d316-1c4b-4281-b951-d872f2087c98
---
 build/all.gyp                                 |   2 +-
 .../extensions/extension_browsertests_misc.cc |   8 +-
 .../extensions/extension_tabs_module.cc       |   5 -
 chrome/chrome_browser.gypi                    |   2 -
 chrome/chrome_renderer.gypi                   |   6 +-
 chrome/chrome_tests.gypi                      |   6 +-
 .../extension_api_client_unittest.cc          |   2 -
 chrome/renderer/render_view.cc                |  23 +-
 chrome/renderer/render_view.h                 |  12 +-
 .../1.0.0.0/french_sentence.html              |   1 +
 .../compact_lang_det/win/cld_scopedptr.h      |   7 +-
 .../compact_lang_det/win/cld_unicodetext.cc   | 108 +++-----
 .../compact_lang_det/win/cld_unicodetext.h    |   7 +-
 third_party/cld/base/string_util.h            |   6 +
 third_party/cld/cld.gyp                       | 238 +++++++++---------
 15 files changed, 190 insertions(+), 243 deletions(-)

diff --git a/build/all.gyp b/build/all.gyp
index c063af25e7a4e..8a0910b813f98 100644
--- a/build/all.gyp
+++ b/build/all.gyp
@@ -21,6 +21,7 @@
         '../testing/gmock.gyp:*',
         '../testing/gtest.gyp:*',
         '../third_party/bzip2/bzip2.gyp:*',
+        '../third_party/cld/cld.gyp:*',
         '../third_party/codesighs/codesighs.gyp:*',
         '../third_party/ffmpeg/ffmpeg.gyp:*',
         '../third_party/icu/icu.gyp:*',
@@ -98,7 +99,6 @@
             '../sandbox/sandbox.gyp:*',
             '../third_party/bsdiff/bsdiff.gyp:*',
             '../third_party/bspatch/bspatch.gyp:*',
-            '../third_party/cld/cld.gyp:*',
             '../third_party/gles2_book/gles2_book.gyp:*',
             '../tools/memory_watcher/memory_watcher.gyp:*',
           ],
diff --git a/chrome/browser/extensions/extension_browsertests_misc.cc b/chrome/browser/extensions/extension_browsertests_misc.cc
index ee5935c56b010..d0409aaac88a4 100644
--- a/chrome/browser/extensions/extension_browsertests_misc.cc
+++ b/chrome/browser/extensions/extension_browsertests_misc.cc
@@ -89,23 +89,17 @@ IN_PROC_BROWSER_TEST_F(ExtensionBrowserTest, Toolstrip) {
       host->render_view_host(), L"", L"testTabsAPI()", &result);
   EXPECT_TRUE(result);
 
-#if defined(OS_WIN)
-  // http://crbug.com/29896 - tabs.detectLanguage is Windows only
-
   // Test for compact language detection API. First navigate to a (static) html
   // file with a French sentence. Then, run the test API in toolstrip1.html to
   // actually call the language detection API through the existing extension,
   // and verify that the language returned is indeed French.
   FilePath language_url = extension_test_data_dir.AppendASCII(
       "french_sentence.html");
-  ui_test_utils::NavigateToURL(
-      browser(),
-      GURL(language_url.ToWStringHack()));
+  ui_test_utils::NavigateToURL(browser(), net::FilePathToFileURL(language_url));
 
   ui_test_utils::ExecuteJavaScriptAndExtractBool(
       host->render_view_host(), L"", L"testTabsLanguageAPI()", &result);
   EXPECT_TRUE(result);
-#endif
 }
 
 IN_PROC_BROWSER_TEST_F(ExtensionBrowserTest, ExtensionViews) {
diff --git a/chrome/browser/extensions/extension_tabs_module.cc b/chrome/browser/extensions/extension_tabs_module.cc
index 19d268727f2fb..8e6bf2f8b7e97 100644
--- a/chrome/browser/extensions/extension_tabs_module.cc
+++ b/chrome/browser/extensions/extension_tabs_module.cc
@@ -832,11 +832,6 @@ void CaptureVisibleTabFunction::SendResultFromBitmap(
 }
 
 bool DetectTabLanguageFunction::RunImpl() {
-  #if !defined(OS_WIN)
-    error_ = keys::kSupportedInWindowsOnlyError;
-    return false;
-  #endif
-
   int tab_id = 0;
   Browser* browser = NULL;
   TabContents* contents = NULL;
diff --git a/chrome/chrome_browser.gypi b/chrome/chrome_browser.gypi
index fb7d2de452587..666ecf8926c82 100755
--- a/chrome/chrome_browser.gypi
+++ b/chrome/chrome_browser.gypi
@@ -2124,12 +2124,10 @@
           ],
           'include_dirs': [
             'third_party/wtl/include',
-            '../third_party/cld',
           ],
           'dependencies': [
             '../gears/gears.gyp:gears',
             '../google_update/google_update.gyp:google_update',
-            '../third_party/cld/cld.gyp:cld',
             '../views/views.gyp:views',
             'installer/installer.gyp:installer_util',
             '<(allocator_target)',
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi
index baac65152328f..ea95478c6d0ed 100755
--- a/chrome/chrome_renderer.gypi
+++ b/chrome/chrome_renderer.gypi
@@ -16,6 +16,7 @@
         '../printing/printing.gyp:printing',
         '../skia/skia.gyp:skia',
         '../third_party/hunspell/hunspell.gyp:hunspell',
+        '../third_party/cld/cld.gyp:cld',
         '../third_party/icu/icu.gyp:icui18n',
         '../third_party/icu/icu.gyp:icuuc',
         '../third_party/npapi/npapi.gyp:npapi',
@@ -27,6 +28,7 @@
       ],
       'include_dirs': [
         '..',
+        '../third_party/cld',
       ],
       'defines': [
         '<@(nacl_defines)',
@@ -168,12 +170,8 @@
         # Windows-specific rules.
         ['OS=="win"', {
           'include_dirs': [
-            '../third_party/cld',
             'third_party/wtl/include',
           ],
-          'dependencies': [
-            '../third_party/cld/cld.gyp:cld',
-          ],
           'conditions': [
             ['win_use_allocator_shim==1', {
               'dependencies': [
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index b56840a6d8efd..7ed6e6adc31d4 100755
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -492,6 +492,7 @@
         '../testing/gmock.gyp:gmock',
         '../testing/gtest.gyp:gtest',
         '../third_party/bzip2/bzip2.gyp:bzip2',
+        '../third_party/cld/cld.gyp:cld',
         '../third_party/icu/icu.gyp:icui18n',
         '../third_party/icu/icu.gyp:icuuc',
         '../third_party/libxml/libxml.gyp:libxml',
@@ -896,8 +897,6 @@
           ],
           'sources!': [
             'browser/views/bookmark_context_menu_test.cc',
-            # Compact Language Detection (cld) is not supported in linux yet.
-            '../third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_unittest_small.cc',
           ],
         }],
         ['OS=="linux" and (toolkit_views==1 or chromeos==1)', {
@@ -947,9 +946,6 @@
             'browser/tab_contents/navigation_controller_unittest.cc',
             'browser/task_manager_unittest.cc',
             '../third_party/hunspell/google/hunspell_tests.cc',
-
-            # Compact Language Detection (cld) is not supported in mac yet.
-            '../third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_unittest_small.cc',
           ],
           # TODO(mark): We really want this for all non-static library targets,
           # but when we tried to pull it up to the common.gypi level, it broke
diff --git a/chrome/renderer/extensions/extension_api_client_unittest.cc b/chrome/renderer/extensions/extension_api_client_unittest.cc
index 05c2ddff83fef..39d275f87b4ab 100644
--- a/chrome/renderer/extensions/extension_api_client_unittest.cc
+++ b/chrome/renderer/extensions/extension_api_client_unittest.cc
@@ -290,7 +290,6 @@ TEST_F(ExtensionAPIClientTest, GetTab) {
                "tabs.get", "2");
 }
 
-#if defined(OS_WIN)
 TEST_F(ExtensionAPIClientTest, DetectTabLanguage) {
   ExpectJsFail("chrome.tabs.detectLanguage(32, function(){}, 20);",
                "Uncaught Error: Too many arguments.");
@@ -306,7 +305,6 @@ TEST_F(ExtensionAPIClientTest, DetectTabLanguage) {
   ExpectJsPass("chrome.tabs.detectLanguage(null, function(){})",
                "tabs.detectLanguage", "null");
 }
-#endif
 
 TEST_F(ExtensionAPIClientTest, GetSelectedTab) {
   ExpectJsFail("chrome.tabs.getSelected(32, function(){}, 20);",
diff --git a/chrome/renderer/render_view.cc b/chrome/renderer/render_view.cc
index 5b1ee822b81b3..dcbda82289872 100644
--- a/chrome/renderer/render_view.cc
+++ b/chrome/renderer/render_view.cc
@@ -67,10 +67,7 @@
 #include "net/base/net_errors.h"
 #include "skia/ext/bitmap_platform_device.h"
 #include "skia/ext/image_operations.h"
-#if defined(OS_WIN)
-// TODO(port): The compact language detection library works only for Windows.
 #include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h"
-#endif
 #include "third_party/WebKit/WebKit/chromium/public/WebAccessibilityCache.h"
 #include "third_party/WebKit/WebKit/chromium/public/WebAccessibilityObject.h"
 #include "third_party/WebKit/WebKit/chromium/public/WebCString.h"
@@ -223,7 +220,7 @@ static const char* const kUnreachableWebDataURL =
 static const char* const kBackForwardNavigationScheme = "history";
 
 // The string returned in DetectLanguage if we failed to detect the language.
-static const char* const kUnknownLanguageCode = "unknown";
+static const char* const kUnknownLanguageCode = "und";
 
 static void GetRedirectChain(WebDataSource* ds, std::vector<GURL>* result) {
   WebVector<WebURL> urls;
@@ -3082,31 +3079,29 @@ std::string RenderView::DetectLanguage() {
   if (!webview() || is_loading_)
     return kUnknownLanguageCode;
 
-  std::string language = kUnknownLanguageCode;
-#if defined(OS_WIN)  // CLD is only available on Windows at this time.
   WebFrame* main_frame = webview()->mainFrame();
   std::wstring contents;
   CaptureText(main_frame, &contents);
-  language = DetermineTextLanguage(contents);
-#endif
-
-  return language;
+  return DetermineTextLanguage(contents);
 }
 
 // static
 std::string RenderView::DetermineTextLanguage(const std::wstring& text) {
   std::string language = kUnknownLanguageCode;
-#if defined(OS_WIN)  // CLD is only available on Windows at this time.
   int num_languages = 0;
   bool is_reliable = false;
+  string16 input = WideToUTF16(text);
   Language cld_language =
-      DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
+      DetectLanguageOfUnicodeText(NULL, input.c_str(), true, &is_reliable,
                                   &num_languages, NULL);
   if (cld_language != NUM_LANGUAGES && cld_language != UNKNOWN_LANGUAGE &&
       cld_language != TG_UNKNOWN_LANGUAGE) {
-    language = LanguageCodeISO639_1(cld_language);
+    // We should not use LanguageCodeISO639_1 because it does not cover all the
+    // languages CLD can detect. As a result, it'll return the invalid language
+    // code for traditional Chinese among others. |LanguageCode| will go through
+    // ISO 639-1, ISO 639-2 and 'other' tables to do the 'right' thing.
+    language = LanguageCode(cld_language);
   }
-#endif
   return language;
 }
 
diff --git a/chrome/renderer/render_view.h b/chrome/renderer/render_view.h
index 4d3a4e94337a4..036cb0bf2aee3 100644
--- a/chrome/renderer/render_view.h
+++ b/chrome/renderer/render_view.h
@@ -436,9 +436,15 @@ class RenderView : public RenderWidget,
 
   PageTranslator* page_translator() const { return page_translator_.get(); }
 
-  // Returns the ISO 639_1 language code of the current page
-  // (ex: en, fr, zh...).  Returns 'unknown' if the language could not be
-  // determined.
+  // Returns the ISO 639 language code of the current page (e.g. en, fr, zh).
+  // If ISO 639-1 code is not available for the language, ISO 639-2 3-letter code
+  // will be returned (e.g. kha for Khasi and und for undetermined). For
+  // traditional Chinese, 'zh-TW' will be returned while for simplified Chinese,
+  // 'zh' will be returned.
+  // TODO(jungshik): Make it return 'he' (the correct ISO 639 code for Hebrew)
+  // instead of the obsolete 'iw'. Perhaps, it's also better to return 'zh-Hans'
+  // (or 'zh-CN') for Simplified Chinese instead of 'zh' to be aligned with
+  // 'zh-TW' for Traditional Chinese.
   std::string DetectLanguage();
 
  protected:
diff --git a/chrome/test/data/extensions/good/Extensions/behllobkkfkfnphdnhnkndlbkcpglgmj/1.0.0.0/french_sentence.html b/chrome/test/data/extensions/good/Extensions/behllobkkfkfnphdnhnkndlbkcpglgmj/1.0.0.0/french_sentence.html
index 3d3c2e88d92d1..a7607f2da9624 100644
--- a/chrome/test/data/extensions/good/Extensions/behllobkkfkfnphdnhnkndlbkcpglgmj/1.0.0.0/french_sentence.html
+++ b/chrome/test/data/extensions/good/Extensions/behllobkkfkfnphdnhnkndlbkcpglgmj/1.0.0.0/french_sentence.html
@@ -4,6 +4,7 @@ source code is governed by a BSD-style license that can be found in the
 LICENSE file.
 -->
 <html>
+<meta charset="ISO-8859-1">
 <body>
 <p>
 Ceci est une phrase complète est en français, rédigé en anglais puis traduits
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h
index 7f182383a71ab..650e57898532f 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h
@@ -5,13 +5,8 @@
 #ifndef BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_SCOPEDPTR_H_
 #define BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_SCOPEDPTR_H_
 
-#include <wincrypt.h>  // to compile common/scopedptr.h
-#include <wininet.h>   // to compile common/scopedptr.h
-
-// This include has to be out of order to compile to compile common/scopedptr.h
+// This include has to be out of order to compile common/scopedptr.h
 #include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_macros.h"
-#include "bar/common/scopedlibrary.h"
-#include "bar/common/scopedptr.h"
 #include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scoped_ptr.h"
 
 #endif  // BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_SCOPEDPTR_H_
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
index c1a4d952a287a..5b0e67e93184b 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
@@ -4,94 +4,46 @@
 
 #include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h"
 
-#include <tchar.h>
-#include <windows.h>
-
+#include <string>
 #include <vector>  // to compile bar/common/component.h
 
 #include "bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h"
-#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h"
-#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h"
+#include "base/string_util.h"
+#include "unicode/normlzr.h"
+#include "unicode/unistr.h"
+#include "unicode/ustring.h"
+
+std::string NormalizeText(const UChar* text) {
+  // To avoid a copy, use the read-only aliasing ctor.
+  icu::UnicodeString source(1, text, -1);
+  icu::UnicodeString normalized;
+  UErrorCode status = U_ZERO_ERROR;
+  icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
+  if (U_FAILURE(status))
+    return std::string();
+  normalized.toLower();
+  std::string utf8;
+  // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough
+  // for most web pages) and does pre-flighting followed by malloc for larger
+  // strings. We have to switch to obtaining the buffer with the maximum size
+  // (UTF-16 length * 3) without pre-flighting if necessary.
+  return normalized.toUTF8String(utf8);
+}
 
 
 // Detects a language of the UTF-16 encoded zero-terminated text.
 // Returns: Language enum.
 Language DetectLanguageOfUnicodeText(
     const CompactLangDet::DetectionTables* detection_tables,
-    const WCHAR* text, bool is_plain_text,
+    const UChar* text, bool is_plain_text,
     bool* is_reliable, int* num_languages,
-    DWORD* error_code) {
-  if (!text || !num_languages) {
-    if (error_code)
-      *error_code = ERROR_INVALID_PARAMETER;
-    return NUM_LANGUAGES;
-  }
-
-  // Normalize text first.  We do not check the return value here since there
-  // is no meaningful recovery we can do in case of failure anyway.
-  // Since the vast majority of texts on the Internet is already normalized
-  // and languages which require normalization are easy to recognize by CLD
-  // anyway, we'll benefit more from trying to detect language in non-normalized
-  // text (and, with some probability, fail to recognize it) than to give up
-  // right away and return the unknown language here.
-  NormalizedUnicodeText nomalized_text;
-  nomalized_text.Normalize(NormalizationC, text);
-
-  // Determine the size of the buffer required to store a lowercased text.
-  int lowercase_text_size =
-      ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
-                    nomalized_text.get(), -1,
-                    NULL, 0);
-  if (!lowercase_text_size) {
-    if (error_code)
-      *error_code = ::GetLastError();
+    int* error_code) {
+  if (!text || !num_languages)
     return NUM_LANGUAGES;
-  }
-
-  scoped_array<WCHAR> lowercase_text(new WCHAR[lowercase_text_size]);
-  if (!lowercase_text.get())
-    return NUM_LANGUAGES;
-
-  // Covert text to lowercase.
-  int lowercasing_result =
-      ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
-                    nomalized_text.get(), -1,
-                    lowercase_text.get(), lowercase_text_size);
-  if (!lowercasing_result) {
-    if (error_code)
-      *error_code = ::GetLastError();
+  // Normalize text to NFC, lowercase and convert to UTF-8.
+  std::string utf8_encoded = NormalizeText(text);
+  if (utf8_encoded.empty())
     return NUM_LANGUAGES;
-  }
-
-  // Determine the size of the buffer required to covert text to UTF-8.
-  int utf8_encoded_buffer_size =
-      ::WideCharToMultiByte(CP_UTF8, 0,
-                            lowercase_text.get(), -1,
-                            NULL, 0,
-                            NULL, NULL);
-  if (!utf8_encoded_buffer_size) {
-    if (error_code)
-      *error_code = ::GetLastError();
-    return NUM_LANGUAGES;
-  }
-
-  scoped_array<char> utf8_encoded_buffer(
-      new char[utf8_encoded_buffer_size]);
-
-  // Convert text to UTF-8.
-  int utf8_encoding_result =
-      ::WideCharToMultiByte(CP_UTF8, 0,
-                            lowercase_text.get(), -1,
-                            utf8_encoded_buffer.get(), utf8_encoded_buffer_size,
-                            NULL, NULL);
-  if (!utf8_encoding_result) {
-    if (error_code)
-      *error_code = ::GetLastError();
-    return NUM_LANGUAGES;
-  }
-
-  if (error_code)
-    *error_code = 0;
 
   // Engage core CLD library language detection.
   Language language3[3] = {
@@ -107,8 +59,8 @@ Language DetectLanguageOfUnicodeText(
   // language3 array is always set according to the detection results and
   // is not affected by this heuristic.
   CompactLangDet::DetectLanguageSummary(detection_tables,
-                                        utf8_encoded_buffer.get(),
-                                        utf8_encoded_buffer_size,
+                                        utf8_encoded.c_str(),
+                                        utf8_encoded.length(),
                                         is_plain_text, language3, percent3,
                                         &text_bytes, is_reliable);
 
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
index 70306913041aa..c0d64aaf88a89 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
@@ -5,9 +5,8 @@
 #ifndef BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
 #define BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
 
-#include <windows.h>
-
 #include "bar/toolbar/cld/i18n/languages/public/languages.h"
+#include "unicode/utypes.h"
 
 namespace CompactLangDet {
   struct DetectionTables;
@@ -33,9 +32,9 @@ namespace CompactLangDet {
 //     for details.
 Language DetectLanguageOfUnicodeText(
     const CompactLangDet::DetectionTables* detection_tables,
-    const WCHAR* text, bool is_plain_text,
+    const UChar* text, bool is_plain_text,
     bool* is_reliable, int* num_languages,
-    DWORD* error_code);
+    int* error_code);
 
 
 #endif  // BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
diff --git a/third_party/cld/base/string_util.h b/third_party/cld/base/string_util.h
index 365d1bf8c7a91..7717e5b6fa7fa 100644
--- a/third_party/cld/base/string_util.h
+++ b/third_party/cld/base/string_util.h
@@ -11,12 +11,18 @@
 
 namespace base {
 
+#ifdef WIN32
 // Compare the two strings s1 and s2 without regard to case using
 // the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if
 // s2 > s1 according to a lexicographic comparison.
 inline int strcasecmp(const char* s1, const char* s2) {
   return _stricmp(s1, s2);
 }
+#else
+inline int strcasecmp(const char* s1, const char* s2) {
+  return ::strcasecmp(s1, s2);  // '::' = POSIX one; unqualified would recurse.
+}
+#endif
 
 }
 
diff --git a/third_party/cld/cld.gyp b/third_party/cld/cld.gyp
index 2f3f192c738d7..6913f2eb3f007 100644
--- a/third_party/cld/cld.gyp
+++ b/third_party/cld/cld.gyp
@@ -3,119 +3,133 @@
 # found in the LICENSE file.
 
 {
-  'conditions': [
-    ['OS=="win"', {
-      'targets': [
-        {
-          'target_name': 'cld',
-          'type': '<(library)',
-          'include_dirs': [
-            '.',
-          ],
-          'msvs_disabled_warnings': [4005, 4006, 4018, 4244, 4309, 4800],
-          'defines': [
-            'CLD_WINDOWS',
-          ],
-          'sources': [
-            'bar/common/scopedlibrary.h',
-            'bar/common/scopedptr.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil_dbg.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil_dbg_empty.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_impl.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_impl.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/letterscript_enum.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/letterscript_enum.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/subsetsequence.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/subsetsequence.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/tote.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/tote.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propjustletter.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propletterscriptnum.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8scannotjustletterspecial.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_128.cc',
-            # For now using the 128 bytes detection in order to save hundreds of KBs on the final package.
-            # 'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_256.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h',
-            # We use the static table at this point, so we don't need to compile the following files:
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_dynamicstate.h',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_dynamicstate.cc',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_loadpolicy.cc',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_loadpolicy.h',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_loadpolicyinterface.h',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_resourceids.h',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_service.h',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_service.cc',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_serviceinterface.h',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_tables.cc',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_tables.h',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/resourceinmemory.cc',
-            #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/resourceinmemory.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_google.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils_windows.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_logging.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_macros.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scoped_ptr.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_strtoint.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib_windows.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils.h',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils_windows.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.cc',
-            'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h',
-            'bar/toolbar/cld/i18n/encodings/internal/encodings.cc',
-            'bar/toolbar/cld/i18n/encodings/proto/encodings.pb.h',
-            'bar/toolbar/cld/i18n/encodings/public/encodings.h',
-            'bar/toolbar/cld/i18n/languages/internal/languages.cc',
-            'bar/toolbar/cld/i18n/languages/proto/languages.pb.h',
-            'bar/toolbar/cld/i18n/languages/public/languages.h',
-            'base/basictypes.h',
-            'base/build_config.h',
-            'base/casts.h',
-            'base/commandlineflags.h',
-            'base/global_strip_options.h',
-            'base/logging.h',
-            'base/macros.h',
-            'base/port.h',
-            'base/crash.h',
-            'base/dynamic_annotations.h',
-            'base/scoped_ptr.h',
-            'base/stl_decl_msvc.h',
-            'base/log_severity.h',
-            'base/strtoint.h',
-            'base/vlog_is_on.h',
-            'base/string_util.h',
-            'base/type_traits.h',
-            'base/template_util.h',
-          ],
-          'direct_dependent_settings': {
-            'defines': [
-              'CLD_WINDOWS',
-              'COMPILER_MSVC',
-            ],
-          },
-        },],
+  'targets': [
+    {
+      'target_name': 'cld',
+      'type': '<(library)',
+      'dependencies': [
+	'../icu/icu.gyp:icuuc',
+      ],
+      'include_dirs': [
+        '.',
+      ],
+      'defines': [
+        'CLD_WINDOWS',
+      ],
+      'sources': [
+        'bar/common/scopedptr.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil_dbg.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/cldutil_dbg_empty.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_impl.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_impl.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/letterscript_enum.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/letterscript_enum.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/subsetsequence.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/subsetsequence.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/tote.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/tote.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propjustletter.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propletterscriptnum.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8scannotjustletterspecial.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_128.cc',
+        # For now, we use the 128-byte detection in order to save hundreds of KB in the final package.
+        # 'bar/toolbar/cld/i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_256.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h',
+        # We use the static table at this point, so we don't need to compile the following files:
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_dynamicstate.h',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_dynamicstate.cc',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_loadpolicy.cc',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_loadpolicy.h',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_loadpolicyinterface.h',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_resourceids.h',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_service.h',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_service.cc',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_serviceinterface.h',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_tables.cc',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_tables.h',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/resourceinmemory.cc',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/resourceinmemory.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_google.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils_windows.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_logging.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_macros.h',
+        # None of the files we build requires these two headers.
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scoped_ptr.h',
+        #'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_strtoint.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib_windows.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils.h',
+        'bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils_windows.cc',
+        'bar/toolbar/cld/i18n/encodings/internal/encodings.cc',
+        'bar/toolbar/cld/i18n/encodings/proto/encodings.pb.h',
+        'bar/toolbar/cld/i18n/encodings/public/encodings.h',
+        'bar/toolbar/cld/i18n/languages/internal/languages.cc',
+        'bar/toolbar/cld/i18n/languages/proto/languages.pb.h',
+        'bar/toolbar/cld/i18n/languages/public/languages.h',
+        'base/basictypes.h',
+        'base/build_config.h',
+        'base/casts.h',
+        'base/commandlineflags.h',
+        'base/global_strip_options.h',
+        'base/logging.h',
+        'base/macros.h',
+        'base/port.h',
+        'base/crash.h',
+        'base/dynamic_annotations.h',
+        'base/scoped_ptr.h',
+        'base/stl_decl_msvc.h',
+        'base/log_severity.h',
+        'base/strtoint.h',
+        'base/vlog_is_on.h',
+        'base/string_util.h',
+        'base/type_traits.h',
+        'base/template_util.h',
+      ],
+      'direct_dependent_settings': {
+        'defines': [
+          'CLD_WINDOWS',
+        ],
       },
-    ],
+      'conditions': [
+        ['OS=="win"', {
+              'direct_dependent_settings': {
+                'defines': [
+                  'COMPILER_MSVC',
+                ],
+              },
+              'msvs_disabled_warnings': [4005, 4006, 4018, 4244, 4309, 4800],
+            },
+        ],
+        ['OS!="win"', {
+              'direct_dependent_settings': {
+                'defines': [
+                  'COMPILER_GCC',
+                ],
+              },
+            },
+        ],
+      ],
+    },
   ],
 }
 
-- 
GitLab