libraries/libuchardet: Updated for version 0.0.6.

Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>
This commit is contained in:
Edinaldo P. Silva 2016-10-13 13:17:24 +07:00 committed by Willy Sudiarto Raharjo
parent d679df6b8f
commit 1fae8150cf
6 changed files with 17 additions and 163 deletions

View File

@ -6,3 +6,7 @@ implementation of the universal charset detection library by Mozilla.
uchardet is an encoding detector library, which takes a sequence of
bytes in an unknown character encoding without any additional
information, and attempts to determine the encoding of the text.
Returned encoding names are iconv-compatible.
It can now detect more charsets, and more reliably than the original
implementation.

View File

@ -1,8 +1,8 @@
#!/bin/sh
#
# Slackware build script for libuchardet.
# Copyright 2015 Edinaldo P. Silva, Rio de Janeiro, Brazil.
#
# Copyright 2015-2016 Edinaldo P. Silva, Rio de Janeiro, Brazil.
# All rights reserved.
#
# Redistribution and use of this script, with or without modification, is
@ -23,13 +23,13 @@
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PRGNAM=libuchardet
VERSION=${VERSION:-0.0.5}
VERSION=${VERSION:-0.0.6}
BUILD=${BUILD:-1}
TAG=${TAG:-_SBo}
if [ -z "$ARCH" ]; then
case "$( uname -m )" in
i?86) ARCH=i486 ;;
i?86) ARCH=i586 ;;
arm*) ARCH=arm ;;
*) ARCH=$( uname -m ) ;;
esac
@ -40,7 +40,7 @@ TMP=${TMP:-/tmp/SBo}
PKG=$TMP/package-$PRGNAM
OUTPUT=${OUTPUT:-/tmp}
if [ "$ARCH" = "i486" ]; then
if [ "$ARCH" = "i586" ]; then
SLKCFLAGS="-O2 -march=i686 -mtune=i686"
LIBDIRSUFFIX=""
elif [ "$ARCH" = "i686" ]; then
@ -62,7 +62,7 @@ rm -rf $PKG
mkdir -p $TMP $PKG $OUTPUT
cd $TMP
rm -rf $SRCNAM-$VERSION
tar xvf $CWD/$SRCNAM-$VERSION.tar.gz
tar xvf $CWD/$SRCNAM-$VERSION.tar.xz
cd $SRCNAM-$VERSION
chown -R root:root .
find -L . \
@ -71,16 +71,12 @@ find -L . \
\( -perm 666 -o -perm 664 -o -perm 640 -o -perm 600 -o -perm 444 \
-o -perm 440 -o -perm 400 \) -exec chmod 644 {} \;
patch -Np1 < $CWD/uchardet-0.0.5-fix-ASCII-detection.patch
patch -Np1 < $CWD/uchardet-0.0.5-use-proper-package-name.patch
cmake \
-DCMAKE_CXX_FLAGS:STRING="$SLKCFLAGS" \
-DCMAKE_INSTALL_PREFIX=/usr \
-DCMAKE_INSTALL_LIBDIR=/usr/lib${LIBDIRSUFFIX} \
.
make
#make test
make install DESTDIR=$PKG
find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" | grep ELF \

View File

@ -1,8 +1,8 @@
PRGNAM="libuchardet"
VERSION="0.0.5"
HOMEPAGE="https://github.com/BYVoid/uchardet"
DOWNLOAD="https://github.com/BYVoid/uchardet/archive/v0.0.5/uchardet-0.0.5.tar.gz"
MD5SUM="2421993e7b098366bd008d81385150b6"
VERSION="0.0.6"
HOMEPAGE="https://www.freedesktop.org/wiki/Software/uchardet/"
DOWNLOAD="https://www.freedesktop.org/software/uchardet/releases/uchardet-0.0.6.tar.xz"
MD5SUM="03425c0bbe5faaf399e15e947d3e03c7"
DOWNLOAD_x86_64=""
MD5SUM_x86_64=""
REQUIRES=""

View File

@ -13,7 +13,7 @@ libuchardet: implementation of the universal charset detection library by Mozill
libuchardet: uchardet is an encoding detector library, which takes a sequence of
libuchardet: bytes in an unknown character encoding without any additional
libuchardet: information, and attempts to determine the encoding of the text.
libuchardet: Returned encoding names are iconv-compatible.
libuchardet:
libuchardet: Home page: https://github.com/BYVoid/uchardet/
libuchardet:
libuchardet: Home page: https://www.freedesktop.org/wiki/Software/uchardet/
libuchardet:

View File

@ -1,116 +0,0 @@
commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f
Author: Jehan <jehan@girinstud.io>
Date: Sat Dec 5 21:04:20 2015 +0100
Nearly-ASCII text with NBSP is still not ASCII.
There is no "exception" in encoding. The non-breaking space 0xA0 is not
ASCII, and therefore returning "ASCII" will later create issues (for
instance trying to re-encode with iconv produces an error).
This was obviously an explicit decision in original code (according to
code comments), probably tied to the specifics of the original program from
Mozilla. Now we want strict detection.
I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only
exception" (note that I could have returned any ISO-8859 charsets since
they all have this character in common).
diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
index ab8bae0..ff06b9d 100644
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -47,6 +47,7 @@
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
{
+ mNbspFound = PR_FALSE;
mDone = PR_FALSE;
mBestGuess = -1; //illegal value as signal
mInTag = PR_FALSE;
@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector()
void
nsUniversalDetector::Reset()
{
+ mNbspFound = PR_FALSE;
mDone = PR_FALSE;
mBestGuess = -1; //illegal value as signal
mInTag = PR_FALSE;
@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
PRUint32 i;
for (i = 0; i < aLen; i++)
{
- /* Other than 0xA0, if every other character is ASCII, the page is ASCII.
+ /* If every other character is ASCII or 0xA0, we don't run charset
+ * probers.
* 0xA0 (NBSP in a few charset) is apparently a rare exception
- * of non-ASCII character contained in ASCII text. */
+ * of non-ASCII character often contained in nearly-ASCII text. */
if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
{
/* We got a non-ASCII byte (high-byte) */
@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
}
else
{
- //ok, just pure ascii so far
- if ( ePureAscii == mInputState &&
- (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
+ /* Just pure ASCII or NBSP so far. */
+ if (aBuf[i] == '\xA0')
{
- //found escape character or HZ "~{"
+ /* ASCII with the only exception of NBSP seems quite common.
+ * I doubt it is really necessary to train a model here, so let's
+ * just make an exception.
+ */
+ mNbspFound = PR_TRUE;
+ }
+ else if (mInputState == ePureAscii &&
+ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')))
+ {
+ /* We found an escape character or HZ "~{". */
mInputState = eEscAscii;
}
mLastChar = aBuf[i];
@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
mDone = PR_TRUE;
mDetectedCharset = mEscCharSetProber->GetCharSetName();
}
+ else if (mNbspFound)
+ {
+ mDetectedCharset = "ISO-8859-1";
+ }
else
{
/* ASCII with the ESC character (or the sequence "~{") is still
@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
break;
default:
- /* Pure ASCII */
- mDetectedCharset = "ASCII";
+ if (mNbspFound)
+ {
+ /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
+ * (though it could have been any ISO-8859 encoding). */
+ mDetectedCharset = "ISO-8859-1";
+ }
+ else
+ {
+ /* Pure ASCII */
+ mDetectedCharset = "ASCII";
+ }
break;
}
return NS_OK;
diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
index 4d9b460..9f0a4b1 100644
--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
@@ -72,6 +72,7 @@ protected:
virtual void Report(const char* aCharset) = 0;
virtual void Reset();
nsInputState mInputState;
+ PRBool mNbspFound;
PRBool mDone;
PRBool mInTag;
PRBool mStart;

View File

@ -1,30 +0,0 @@
commit b6d872bbec3be7abfccbdfd3d90e784cf7281c55
Author: Jehan <jehan@girinstud.io>
Date: Tue Dec 15 21:40:16 2015 +0100
app: package name wrong in CMakeLists.txt.
Probably coming from a copy-paste error when the build system was
originally created.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b65c49..4f279e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
######## Project settings
cmake_minimum_required(VERSION 2.8)
-set (PACKAGE_NAME opencc)
+set (PACKAGE_NAME uchardet)
project (${PACKAGE_NAME} CXX C)
enable_testing()
@@ -54,7 +54,7 @@ if (DEFINED SYSCONF_INSTALL_DIR)
set (DIR_ETC ${SYSCONF_INSTALL_DIR})
endif (DEFINED SYSCONF_INSTALL_DIR)
-set (DIR_SHARE_UCHARDET ${DIR_SHARE}/opencc)
+set (DIR_SHARE_UCHARDET ${DIR_SHARE}/uchardet)
set (DIR_SHARE_LOCALE ${DIR_SHARE}/locale)
######## Configuration