diff --git a/libraries/libuchardet/README b/libraries/libuchardet/README index 9822181500..62c9882143 100644 --- a/libraries/libuchardet/README +++ b/libraries/libuchardet/README @@ -6,3 +6,7 @@ implementation of the universal charset detection library by Mozilla. uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. +Returned encoding names are iconv-compatible. + +It can now detect more charsets, and more reliably than the original +implementation. diff --git a/libraries/libuchardet/libuchardet.SlackBuild b/libraries/libuchardet/libuchardet.SlackBuild index 42e440807c..aa2b6cbbb4 100644 --- a/libraries/libuchardet/libuchardet.SlackBuild +++ b/libraries/libuchardet/libuchardet.SlackBuild @@ -1,8 +1,8 @@ #!/bin/sh - +# # Slackware build script for libuchardet. - -# Copyright 2015 Edinaldo P. Silva, Rio de Janeiro, Brazil. +# +# Copyright 2015-2016 Edinaldo P. Silva, Rio de Janeiro, Brazil. # All rights reserved. # # Redistribution and use of this script, with or without modification, is @@ -23,13 +23,13 @@ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PRGNAM=libuchardet -VERSION=${VERSION:-0.0.5} +VERSION=${VERSION:-0.0.6} BUILD=${BUILD:-1} TAG=${TAG:-_SBo} if [ -z "$ARCH" ]; then case "$( uname -m )" in - i?86) ARCH=i486 ;; + i?86) ARCH=i586 ;; arm*) ARCH=arm ;; *) ARCH=$( uname -m ) ;; esac @@ -40,7 +40,7 @@ TMP=${TMP:-/tmp/SBo} PKG=$TMP/package-$PRGNAM OUTPUT=${OUTPUT:-/tmp} -if [ "$ARCH" = "i486" ]; then +if [ "$ARCH" = "i586" ]; then SLKCFLAGS="-O2 -march=i686 -mtune=i686" LIBDIRSUFFIX="" elif [ "$ARCH" = "i686" ]; then @@ -62,7 +62,7 @@ rm -rf $PKG mkdir -p $TMP $PKG $OUTPUT cd $TMP rm -rf $SRCNAM-$VERSION -tar xvf $CWD/$SRCNAM-$VERSION.tar.gz +tar xvf $CWD/$SRCNAM-$VERSION.tar.xz cd $SRCNAM-$VERSION chown -R root:root . find -L . \ @@ -71,16 +71,12 @@ find -L . \ \( -perm 666 -o -perm 664 -o -perm 640 -o -perm 600 -o -perm 444 \ -o -perm 440 -o -perm 400 \) -exec chmod 644 {} \; -patch -Np1 < $CWD/uchardet-0.0.5-fix-ASCII-detection.patch -patch -Np1 < $CWD/uchardet-0.0.5-use-proper-package-name.patch - cmake \ -DCMAKE_CXX_FLAGS:STRING="$SLKCFLAGS" \ -DCMAKE_INSTALL_PREFIX=/usr \ -DCMAKE_INSTALL_LIBDIR=/usr/lib${LIBDIRSUFFIX} \ . make -#make test make install DESTDIR=$PKG find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" | grep ELF \ diff --git a/libraries/libuchardet/libuchardet.info b/libraries/libuchardet/libuchardet.info index e58f5e0b7b..1a881d06e5 100644 --- a/libraries/libuchardet/libuchardet.info +++ b/libraries/libuchardet/libuchardet.info @@ -1,8 +1,8 @@ PRGNAM="libuchardet" -VERSION="0.0.5" -HOMEPAGE="https://github.com/BYVoid/uchardet" -DOWNLOAD="https://github.com/BYVoid/uchardet/archive/v0.0.5/uchardet-0.0.5.tar.gz" -MD5SUM="2421993e7b098366bd008d81385150b6" +VERSION="0.0.6" +HOMEPAGE="https://www.freedesktop.org/wiki/Software/uchardet/" +DOWNLOAD="https://www.freedesktop.org/software/uchardet/releases/uchardet-0.0.6.tar.xz" +MD5SUM="03425c0bbe5faaf399e15e947d3e03c7" DOWNLOAD_x86_64="" MD5SUM_x86_64="" REQUIRES="" diff --git a/libraries/libuchardet/slack-desc b/libraries/libuchardet/slack-desc index 11f882aabd..c3d801fbe6 100644 --- a/libraries/libuchardet/slack-desc +++ b/libraries/libuchardet/slack-desc @@ -13,7 +13,7 @@ libuchardet: implementation of the universal charset detection library by Mozill libuchardet: uchardet is an encoding detector library, which takes a sequence of libuchardet: bytes in an unknown character encoding without any additional libuchardet: information, and attempts to determine the encoding of the text. +libuchardet: Returned encoding names are iconv-compatible. libuchardet: -libuchardet: Home page: https://github.com/BYVoid/uchardet/ -libuchardet: +libuchardet: Home page: https://www.freedesktop.org/wiki/Software/uchardet/ libuchardet: diff --git a/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch b/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch deleted file mode 100644 index c82aee866e..0000000000 --- a/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch +++ /dev/null @@ -1,116 +0,0 @@ -commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f -Author: Jehan -Date: Sat Dec 5 21:04:20 2015 +0100 - - Nearly-ASCII text with NBSP is still not ASCII. - - There is no "exception" in encoding. The non-breaking space 0xA0 is not - ASCII, and therefore returning "ASCII" will later create issues (for - instance trying to re-encode with iconv produces an error). - This was obviously an explicit decision in original code (according to - code comments), probably tied to specifity of the original program from - Mozilla. Now we want strict detection. - I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only - exception" (note that I could have returned any ISO-8859 charsets since - they all have this character in common). - -diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp -index ab8bae0..ff06b9d 100644 ---- a/src/nsUniversalDetector.cpp -+++ b/src/nsUniversalDetector.cpp -@@ -47,6 +47,7 @@ - - nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) - { -+ mNbspFound = PR_FALSE; - mDone = PR_FALSE; - mBestGuess = -1; //illegal value as signal - mInTag = PR_FALSE; -@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector() - void - nsUniversalDetector::Reset() - { -+ mNbspFound = PR_FALSE; - mDone = PR_FALSE; - mBestGuess = -1; //illegal value as signal - mInTag = PR_FALSE; -@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) - PRUint32 i; - for (i = 0; i < aLen; i++) - { -- /* Other than 0xA0, if every other character is ASCII, the page is ASCII. -+ /* If every other character is ASCII or 0xA0, we don't run charset -+ * probers. - * 0xA0 (NBSP in a few charset) is apparently a rare exception -- * of non-ASCII character contained in ASCII text. */ -+ * of non-ASCII character often contained in nearly-ASCII text. */ - if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') - { - /* We got a non-ASCII byte (high-byte) */ -@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) - } - else - { -- //ok, just pure ascii so far -- if ( ePureAscii == mInputState && -- (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) -+ /* Just pure ASCII or NBSP so far. */ -+ if (aBuf[i] == '\xA0') - { -- //found escape character or HZ "~{" -+ /* ASCII with the only exception of NBSP seems quite common. -+ * I doubt it is really necessary to train a model here, so let's -+ * just make an exception. -+ */ -+ mNbspFound = PR_TRUE; -+ } -+ else if (mInputState == ePureAscii && -+ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) -+ { -+ /* We found an escape character or HZ "~{". */ - mInputState = eEscAscii; - } - mLastChar = aBuf[i]; -@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) - mDone = PR_TRUE; - mDetectedCharset = mEscCharSetProber->GetCharSetName(); - } -+ else if (mNbspFound) -+ { -+ mDetectedCharset = "ISO-8859-1"; -+ } - else - { - /* ASCII with the ESC character (or the sequence "~{") is still -@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) - break; - - default: -- /* Pure ASCII */ -- mDetectedCharset = "ASCII"; -+ if (mNbspFound) -+ { -+ /* ISO-8859-1 is a good result candidate for ASCII + NBSP. -+ * (though it could have been any ISO-8859 encoding). */ -+ mDetectedCharset = "ISO-8859-1"; -+ } -+ else -+ { -+ /* Pure ASCII */ -+ mDetectedCharset = "ASCII"; -+ } - break; - } - return NS_OK; -diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h -index 4d9b460..9f0a4b1 100644 ---- a/src/nsUniversalDetector.h -+++ b/src/nsUniversalDetector.h -@@ -72,6 +72,7 @@ protected: - virtual void Report(const char* aCharset) = 0; - virtual void Reset(); - nsInputState mInputState; -+ PRBool mNbspFound; - PRBool mDone; - PRBool mInTag; - PRBool mStart; diff --git a/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch b/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch deleted file mode 100644 index b1ed88991c..0000000000 --- a/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch +++ /dev/null @@ -1,30 +0,0 @@ -commit b6d872bbec3be7abfccbdfd3d90e784cf7281c55 -Author: Jehan -Date: Tue Dec 15 21:40:16 2015 +0100 - - app: package name wrong in CMakeLists.txt. - - Probably coming from a copy-paste error when the build system was - originally created. - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 0b65c49..4f279e1 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -1,6 +1,6 @@ - ######## Project settings - cmake_minimum_required(VERSION 2.8) --set (PACKAGE_NAME opencc) -+set (PACKAGE_NAME uchardet) - project (${PACKAGE_NAME} CXX C) - enable_testing() - -@@ -54,7 +54,7 @@ if (DEFINED SYSCONF_INSTALL_DIR) - set (DIR_ETC ${SYSCONF_INSTALL_DIR}) - endif (DEFINED SYSCONF_INSTALL_DIR) - --set (DIR_SHARE_UCHARDET ${DIR_SHARE}/opencc) -+set (DIR_SHARE_UCHARDET ${DIR_SHARE}/uchardet) - set (DIR_SHARE_LOCALE ${DIR_SHARE}/locale) - - ######## Configuration