feat: Add DTextEncoding class.

1. Add DTextEncoding class, provides encoding detection and encoding conversion.
2. Add interface unit test cases, example and comment document.
3. DTextEncoding dynamically loads libuchardet.so and libicuuc.so to support
   extended text encodings.

Log: Add DTextEncoding class.
Influence: Add build dependencies libuchardet-dev and libicu-dev.
This commit is contained in:
renbin 2022-12-13 14:07:02 +08:00 committed by deepin-bot[bot]
parent 88fc2ffafc
commit 64bbdc979b
13 changed files with 930 additions and 2 deletions

View File

@ -7,7 +7,7 @@ arch=('x86_64' 'aarch64')
url="https://github.com/linuxdeepin/dtkcore"
license=('LGPL3')
depends=('dconf' 'deepin-desktop-base-git' 'python' 'gsettings-qt' 'lshw')
makedepends=('git' 'qt5-tools' 'dtkcommon-git' 'ninja' 'cmake' 'doxygen')
makedepends=('git' 'qt5-tools' 'dtkcommon-git' 'ninja' 'cmake' 'doxygen' 'uchardet' 'icu')
conflicts=('dtkcore')
provides=('dtkcore')
groups=('deepin-git')

3
debian/control vendored
View File

@ -4,7 +4,8 @@ Priority: optional
Maintainer: Deepin Packages Builder <packages@deepin.com>
Build-Depends: debhelper (>= 9), pkg-config,
qttools5-dev-tools, qtbase5-private-dev, doxygen,
libgsettings-qt-dev, libgtest-dev, libdtkcommon-dev, cmake
libgsettings-qt-dev, libgtest-dev, libdtkcommon-dev, cmake,
libuchardet-dev, libicu-dev
Standards-Version: 3.9.8
Package: libdtkcore5

View File

@ -0,0 +1,58 @@
/*!
@~chinese
@ingroup dutil
@file include/util/dtextencoding.h
@details 本文件包含文本编码识别和文本编码转换的公共接口。
@class Dtk::Core::DTextEncoding
@brief 文本编码信息类,提供文本编码识别和文本编码转换的公共接口。
@details 提供文本编码识别和文本编码转换的公共接口,默认使用 QTextCodec 进行检测,
若系统环境中存在 libuchardet.so 及 libicuuc.so 库,可拓展支持的编码格式。
@fn QByteArray Dtk::Core::DTextEncoding::detectTextEncoding(const QByteArray &content)
@brief 检测给定文本的编码格式。
@details 默认使用 QTextCodec 检测,若系统环境中存在 libuchardet.so 及 libicuuc.so 库,可拓展支持的编码格式。
检测会判断最接近的编码格式,未成功识别或为 ASCII 编码格式,将返回 UTF-8 编码格式。
@param[in] content 待检测的文本内容
@return 文本编码格式
@fn QByteArray Dtk::Core::DTextEncoding::detectFileEncoding(const QString &fileName, bool *isOk)
@brief 检测给定文件的文本编码格式,将读取文件头部最多 64KB 的文本用于检测。若文件访问失败,返回空编码格式。
@param[in] fileName 文件路径
@param[out] isOk 检测是否成功,主要判断文件内容能否正确读取
@return 文本编码格式
@sa DTextEncoding::detectTextEncoding
@fn bool Dtk::Core::DTextEncoding::convertTextEncoding(QByteArray &content, QByteArray &outContent, const QByteArray &toEncoding, const QByteArray &fromEncoding, QString *errString)
@brief 将输入的文本 `content` 从 `fromEncoding` 编码格式转换到 `toEncoding` 编码格式,转换后的文本保存到 `outContent` 。
若转换过程中出现错误,将返回 false , 并设置 `errString` 错误信息。
@note 当处理大量文本数据时,需考虑并行处理,防止阻塞线程。
@param[in] content 传入的文本
@param[out] outContent 编码转换后的文本
@param[in] toEncoding 转换的编码格式
@param[in] fromEncoding 原始的编码格式,默认为空,会通过 `DTextEncoding::detectTextEncoding` 检测编码格式
@param[out] errString 错误信息
@return 是否转换成功
@fn bool Dtk::Core::DTextEncoding::convertFileEncoding(const QString &fileName, const QByteArray &toEncoding, const QByteArray &fromEncoding, QString *errString)
@brief 读取输入的 `fileName` 文件内容,将文件内容从 `fromEncoding` 编码格式转换到 `toEncoding` 编码格式,转换后的文本保存到 `fileName` 。
若转换过程中出现错误,将返回 false , 并设置 `errString` 错误信息。
@param[in] fileName 传入及保存的文件路径
@param[in] toEncoding 转换的编码格式
@param[in] fromEncoding 原始的编码格式,为空时会通过 `DTextEncoding::detectTextEncoding` 检测编码格式
@param[out] errString 错误信息
@return 是否转换成功
@sa DTextEncoding::convertTextEncoding
@fn bool Dtk::Core::DTextEncoding::convertFileEncodingTo(const QString &fromFile, const QString &toFile, const QByteArray &toEncoding, const QByteArray &fromEncoding, QString *errString)
@brief 读取输入的 `fromFile` 文件内容,将文件内容从 `fromEncoding` 编码格式转换到 `toEncoding` 编码格式,转换后的文本保存到 `toFile` 。
若转换过程中出现错误,将返回 false , 并设置 `errString` 错误信息。
@param[in] fromFile 传入的文件路径
@param[in] toFile 保存的文件路径
@param[in] toEncoding 转换的编码格式
@param[in] fromEncoding 原始的编码格式,为空时会通过 `DTextEncoding::detectTextEncoding` 检测编码格式
@param[out] errString 错误信息
@return 是否转换成功
@sa DTextEncoding::convertTextEncoding
*/

View File

@ -1,2 +1,3 @@
add_subdirectory(dasync-example)
add_subdirectory(expintf-example)
add_subdirectory(textcodec-example)

View File

@ -0,0 +1,18 @@
# Example executable built from main.cpp; the binary is named "textcodec".
set(BINNAME textcodec)
find_package(Qt5 REQUIRED COMPONENTS Core)
add_executable(${BINNAME}
main.cpp
)
# Link the example against Qt5::Core and dtkcore.
target_link_libraries(
${BINNAME} PRIVATE
Qt5::Core
dtkcore
)
# Header search paths, given relative to this example's directory.
target_include_directories(${BINNAME} PUBLIC
../../include/util/
../../include/
)

View File

@ -0,0 +1,103 @@
// SPDX-FileCopyrightText: 2022 UnionTech Software Technology Co., Ltd.
//
// SPDX-License-Identifier: LGPL-3.0-or-later
#include <DTextEncoding>
#include <QCoreApplication>
#include <QCommandLineOption>
#include <QCommandLineParser>
#include <QDebug>
DCORE_USE_NAMESPACE
void convertFileEncoding(const QString &fromFile,
const QString &toFile,
const QByteArray &toEncoding,
const QByteArray &fromEncoding = QByteArray())
{
QByteArray contentEncoding = fromEncoding;
if (contentEncoding.isEmpty()) {
bool isOk = false;
contentEncoding = DTextEncoding::detectFileEncoding(fromFile, &isOk);
if (!isOk) {
qInfo().noquote() << QString("Detect file %1 encoding failed").arg(fromFile);
return;
}
}
QString errString;
if (!DTextEncoding::convertFileEncodingTo(fromFile, toFile, toEncoding, contentEncoding, &errString)) {
qInfo().noquote() << QString("Convert file %1 encoding from %2 to %3 failed. error: %4")
.arg(fromFile)
.arg(QString::fromUtf8(contentEncoding))
.arg(QString::fromUtf8(toEncoding))
.arg(errString);
} else {
qInfo().noquote() << QString("Convert file %1 encoding from %2 to %3 successed.")
.arg(fromFile)
.arg(QString::fromUtf8(contentEncoding))
.arg(QString::fromUtf8(toEncoding));
}
}
int main(int argc, char *argv[])
{
QCoreApplication app(argc, argv);
QCoreApplication::setApplicationName("Text codec");
QCommandLineOption toEncodingOption({"t", "toEncoding"}, "Convert file encoding to specified encoding.", "encoding");
QCommandLineOption fromEncodingOption({"f", "fromEncoding"}, "Convert file encoding from specified encoding.", "encoding");
QCommandLineOption outputOption(
{"o", "output"}, "Save converted text with file path, only supported when opening a single file.", "path");
QCommandLineParser parser;
parser.setApplicationDescription("Text codec, provide encoding detection and encoding conversion.");
parser.addHelpOption();
parser.addOption(toEncodingOption);
parser.addOption(fromEncodingOption);
parser.addOption(outputOption);
parser.addPositionalArgument("file", "Open file.", "[file...]");
parser.process(app);
const QStringList args = parser.positionalArguments();
if (args.isEmpty()) {
parser.showHelp();
return 0;
}
const QStringList fileArgs = parser.positionalArguments();
if (fileArgs.isEmpty()) {
qInfo().noquote() << "Not set open file.";
return 0;
}
if (parser.isSet(outputOption)) {
if (fileArgs.size() > 1) {
qInfo().noquote() << "Output file path only supported when opening a single file.";
return 0;
} else if (!parser.isSet(toEncodingOption)) {
qInfo().noquote() << "Convert file with not set convert encoding.";
} else {
QString fromFile = fileArgs.first();
QString toFile = parser.value(outputOption);
QByteArray toEncoding = parser.value(toEncodingOption).toUtf8();
convertFileEncoding(fromFile, toFile, toEncoding, parser.value(fromEncodingOption).toUtf8());
return 0;
}
}
QByteArray toEncoding = parser.value(toEncodingOption).toUtf8();
for (QString fileName : fileArgs) {
if (toEncoding.isEmpty()) {
// Only display file encoding.
qInfo().noquote() << fileName << DTextEncoding::detectFileEncoding(fileName);
} else {
// Convert file encoding.
convertFileEncoding(fileName, fileName, toEncoding, parser.value(fromEncodingOption).toUtf8());
}
}
return 0;
}

View File

@ -0,0 +1 @@
#include "dtextencoding.h"

View File

@ -0,0 +1,39 @@
// SPDX-FileCopyrightText: 2022 UnionTech Software Technology Co., Ltd.
//
// SPDX-License-Identifier: LGPL-3.0-or-later
#ifndef DTEXTENCODING_H
#define DTEXTENCODING_H
#include <dtkcore_global.h>
#include <QString>
#include <QByteArray>
DCORE_BEGIN_NAMESPACE
// Static helpers for detecting the encoding of text/files and converting between encodings.
class LIBDTKCORESHARED_EXPORT DTextEncoding
{
public:
// Returns the name of the encoding detected for `content`.
static QByteArray detectTextEncoding(const QByteArray &content);
// Detects the encoding of the file `fileName`; `isOk` (optional) receives whether the file could be opened.
static QByteArray detectFileEncoding(const QString &fileName, bool *isOk = nullptr);
// Converts `content` from `fromEncoding` to `toEncoding` and stores the result in `outContent`.
// An empty `fromEncoding` means the source encoding is detected from `content`.
// Returns false on failure; `errString` (optional) may receive a description.
static bool convertTextEncoding(QByteArray &content,
QByteArray &outContent,
const QByteArray &toEncoding,
const QByteArray &fromEncoding = QByteArray(),
QString *errString = nullptr);
// Converts the file `fileName` in place; parameters otherwise as for convertTextEncoding().
static bool convertFileEncoding(const QString &fileName,
const QByteArray &toEncoding,
const QByteArray &fromEncoding = QByteArray(),
QString *errString = nullptr);
// Converts `fromFile` and writes the result to `toFile`; parameters otherwise as for convertTextEncoding().
static bool convertFileEncodingTo(const QString &fromFile,
const QString &toFile,
const QByteArray &toEncoding,
const QByteArray &fromEncoding = QByteArray(),
QString *errString = nullptr);
};
DCORE_END_NAMESPACE
#endif // DTEXTENCODING_H

View File

@ -15,6 +15,8 @@ BuildRequires: annobin
BuildRequires: pkgconfig(Qt5Core)
BuildRequires: pkgconfig(gsettings-qt)
BuildRequires: gtest-devel
BuildRequires: uchardet-devel
BuildRequires: libicu-devel
# since f30
Obsoletes: deepin-tool-kit <= 0.3.3

View File

@ -19,6 +19,11 @@ find_package(Qt5 REQUIRED COMPONENTS DBus)
endif()
find_package(Qt5 REQUIRED COMPONENTS Xml)
# start text encoding
find_package(ICU REQUIRED COMPONENTS uc)
pkg_check_modules(uchardet REQUIRED uchardet)
# end text encoding
# start base
include(base/base.cmake)
# end base
@ -66,6 +71,8 @@ if(LINUX)
Qt5::DBus
Qt5::Xml
${QGSettings_LIBRARIES}
ICU::uc
uchardet
)
else()
add_library(${LIBNAME} SHARED
@ -81,6 +88,8 @@ else()
${LIBNAME} PRIVATE
Qt5::Core
Qt5::Xml
ICU::uc
uchardet
)
endif()
set_target_properties(${LIBNAME} PROPERTIES
@ -90,6 +99,7 @@ set_target_properties(${LIBNAME} PROPERTIES
target_include_directories( ${LIBNAME} PUBLIC
${QGSettings_INCLUDE_DIRS}
${Qt5Core_PRIVATE_INCLUDE_DIRS}
${uchardet_INCLUDE_DIRS}
../include/util/
../include/dci/
../include/log/

446
src/util/dtextencoding.cpp Normal file
View File

@ -0,0 +1,446 @@
// SPDX-FileCopyrightText: 2022 UnionTech Software Technology Co., Ltd.
//
// SPDX-License-Identifier: LGPL-3.0-or-later
#include "dtextencoding.h"
#include <QtMath>
#include <QFile>
#include <QTextCodec>
#include <QLibrary>
#include <unicode/ucsdet.h>
#include <uchardet/uchardet.h>
#include <iconv.h>
DCORE_BEGIN_NAMESPACE
class LibICU
{
public:
LibICU();
~LibICU();
bool isValid();
bool detectEncoding(const QByteArray &content, QByteArrayList &charset);
UCharsetDetector *(*ucsdet_open)(UErrorCode *status);
void (*ucsdet_close)(UCharsetDetector *ucsd);
void (*ucsdet_setText)(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
const UCharsetMatch **(*ucsdet_detectAll)(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
const char *(*ucsdet_getName)(const UCharsetMatch *ucsm, UErrorCode *status);
int32_t (*ucsdet_getConfidence)(const UCharsetMatch *ucsm, UErrorCode *status);
private:
QLibrary *icuuc = nullptr;
Q_DISABLE_COPY(LibICU);
};
class Libuchardet
{
public:
Libuchardet();
~Libuchardet();
bool isValid();
QByteArray detectEncoding(const QByteArray &content);
uchardet_t (*uchardet_new)(void);
void (*uchardet_delete)(uchardet_t ud);
int (*uchardet_handle_data)(uchardet_t ud, const char *data, size_t len);
void (*uchardet_data_end)(uchardet_t ud);
void (*uchardet_reset)(uchardet_t ud);
const char *(*uchardet_get_charset)(uchardet_t ud);
private:
QLibrary *uchardet = nullptr;
Q_DISABLE_COPY(Libuchardet);
};
// Shared loader objects for the two optional detector libraries, declared via Q_GLOBAL_STATIC
// and accessed as LibICUInstance() / LibuchardetInstance().
Q_GLOBAL_STATIC(LibICU, LibICUInstance);
Q_GLOBAL_STATIC(Libuchardet, LibuchardetInstance);
/*!
  \internal
  Load the ICU library and resolve every detector entry point.

  On any failure the library handle is released and reset to null, which makes
  isValid() return false.
 */
LibICU::LibICU()
{
    // Load libicuuc.so
    icuuc = new QLibrary("libicuuc");
    if (!icuuc->load()) {
        delete icuuc;
        icuuc = nullptr;
        return;
    }

    // Shared failure path: a partially resolved library is unusable, so drop it entirely.
    auto initFunctionError = [this]() {
        icuuc->unload();
        delete icuuc;
        icuuc = nullptr;
    };

    // NOTE(review): ICU declares ucsdet_* in its i18n component and many ICU builds export
    // version-suffixed symbol names - confirm these plain names resolve from libicuuc.
#define INIT_FUNCTION(Name) \
    Name = reinterpret_cast<decltype(Name)>(icuuc->resolve(#Name)); \
    if (!Name) { \
        initFunctionError(); \
        return; \
    }

    INIT_FUNCTION(ucsdet_open);
    INIT_FUNCTION(ucsdet_close);
    INIT_FUNCTION(ucsdet_setText);
    INIT_FUNCTION(ucsdet_detectAll);
    INIT_FUNCTION(ucsdet_getName);
    INIT_FUNCTION(ucsdet_getConfidence);

    // Keep the helper macro local to this constructor so later code can define its own
    // INIT_FUNCTION without an incompatible macro redefinition.
#undef INIT_FUNCTION
}
/*!
  \internal
  Release the QLibrary wrapper, if one is still held.
 */
LibICU::~LibICU()
{
    // delete on a null pointer is a no-op, so no explicit guard is required.
    delete icuuc;
}
/*!
  \internal
  Returns true while a library handle is held, i.e. loading and symbol resolution succeeded.
 */
bool LibICU::isValid()
{
    return icuuc != nullptr;
}
bool LibICU::detectEncoding(const QByteArray &content, QByteArrayList &charset)
{
UErrorCode status = U_ZERO_ERROR;
UCharsetDetector *detector = ucsdet_open(&status);
if (U_FAILURE(status)) {
return false;
}
ucsdet_setText(detector, content.data(), content.size(), &status);
if (U_FAILURE(status)) {
ucsdet_close(detector);
return false;
}
int32_t matchCount = 0;
const UCharsetMatch **charsetMatch = ucsdet_detectAll(detector, &matchCount, &status);
if (U_FAILURE(status)) {
ucsdet_close(detector);
return false;
}
int recrodCount = qMin(3, matchCount);
for (int i = 0; i < recrodCount; i++) {
const char *encoding = ucsdet_getName(charsetMatch[i], &status);
if (U_FAILURE(status)) {
ucsdet_close(detector);
return false;
}
charset << QByteArray(encoding);
}
ucsdet_close(detector);
return true;
}
/*!
  \internal
  Load libuchardet (version "0") and resolve every detector entry point.

  On any failure the library handle is released and reset to null, which makes
  isValid() return false.
 */
Libuchardet::Libuchardet()
{
    uchardet = new QLibrary("libuchardet", "0");
    if (!uchardet->load()) {
        delete uchardet;
        uchardet = nullptr;
        return;
    }

    // Shared failure path: a partially resolved library is unusable, so drop it entirely.
    auto initFunctionError = [this]() {
        uchardet->unload();
        delete uchardet;
        uchardet = nullptr;
    };

    // Discard any earlier definition of the helper macro first; redefining a macro with a
    // different body is not allowed.
#undef INIT_FUNCTION
#define INIT_FUNCTION(Name) \
    Name = reinterpret_cast<decltype(Name)>(uchardet->resolve(#Name)); \
    if (!Name) { \
        initFunctionError(); \
        return; \
    }

    INIT_FUNCTION(uchardet_new);
    INIT_FUNCTION(uchardet_delete);
    INIT_FUNCTION(uchardet_handle_data);
    INIT_FUNCTION(uchardet_data_end);
    INIT_FUNCTION(uchardet_reset);
    INIT_FUNCTION(uchardet_get_charset);

    // The helper macro is only meaningful inside this constructor.
#undef INIT_FUNCTION
}
/*!
  \internal
  Release the QLibrary wrapper, if one is still held.
 */
Libuchardet::~Libuchardet()
{
    // delete on a null pointer is a no-op, so no explicit guard is required.
    delete uchardet;
}
/*!
  \internal
  Returns true while a library handle is held, i.e. loading and symbol resolution succeeded.
 */
bool Libuchardet::isValid()
{
    return uchardet != nullptr;
}
/*!
  \internal
  Feed \a content to a fresh uchardet detector and return the charset name it reports.
  An empty array is returned when uchardet_handle_data() reports a non-zero result.
 */
QByteArray Libuchardet::detectEncoding(const QByteArray &content)
{
    uchardet_t detector = uchardet_new();
    const int feedResult = uchardet_handle_data(detector, content.data(), content.size());

    QByteArray detected;
    if (feedResult == 0) {
        uchardet_data_end(detector);
        detected = QByteArray(uchardet_get_charset(detector));
    }

    uchardet_delete(detector);
    return detected;
}
/*!
  \internal
  Reconcile the primary detection result \a charset with the ICU candidates in
  \a icuCharsetList.

  - No ICU candidates: \a charset is returned unchanged.
  - Empty \a charset: GB18030 is preferred when ICU lists it, otherwise the first ICU candidate.
  - Otherwise the first ICU candidate replaces \a charset only when its name contains
    \a charset while \a charset does not contain it; in every other case \a charset wins.
 */
QByteArray selectCharset(const QByteArray &charset, const QByteArrayList &icuCharsetList)
{
    if (icuCharsetList.isEmpty()) {
        return charset;
    }

    const QByteArray &topIcuMatch = icuCharsetList[0];

    if (charset.isEmpty()) {
        static const QByteArray encodingGB18030("GB18030");
        if (icuCharsetList.contains(encodingGB18030)) {
            return encodingGB18030;
        }
        return topIcuMatch;
    }

    if (charset.contains(topIcuMatch)) {
        return charset;
    }
    if (topIcuMatch.contains(charset)) {
        return topIcuMatch;
    }
    return charset;
}
/*!
  Detect the encoding of \a content and return its name.

  uchardet and ICU are consulted when their libraries could be loaded; otherwise only a
  byte-order-mark check through QTextCodec is performed. Empty input, unrecognised input
  and plain ASCII all yield "UTF-8".
 */
QByteArray DTextEncoding::detectTextEncoding(const QByteArray &content)
{
    if (content.isEmpty()) {
        return QByteArray("UTF-8");
    }

    QByteArray charset;

    if (LibuchardetInstance()->isValid()) {
        charset = LibuchardetInstance()->detectEncoding(content);
    }

    if (LibICUInstance()->isValid()) {
        QByteArrayList icuCharsetList;
        if (LibICUInstance()->detectEncoding(content, icuCharsetList)) {
            if (charset.isEmpty() && !icuCharsetList.isEmpty()) {
                charset = icuCharsetList.first();
            } else {
                // Improve GB18030 encoding recognition rate.
                charset = selectCharset(charset, icuCharsetList);
            }
        }
    }

    if (charset.isEmpty()) {
        // BOM check only. A null default codec is passed on purpose: the one-argument
        // overload falls back to a Latin-1 codec when no BOM is present, which would make
        // this function report ISO-8859-1 instead of the UTF-8 default below.
        QTextCodec *codec = QTextCodec::codecForUtfText(content, nullptr);
        if (codec) {
            return codec->name();
        }
    }

    // Use default encoding.
    if (charset.isEmpty() || charset.contains("ASCII")) {
        charset = QByteArray("UTF-8");
    }

    return charset;
}
/*!
  Detect the encoding of the file \a fileName from at most its first USHRT_MAX bytes.

  \a isOk, when given, is set to false if the file cannot be opened (an empty array is
  returned in that case) and to true otherwise.
 */
QByteArray DTextEncoding::detectFileEncoding(const QString &fileName, bool *isOk)
{
    QFile file(fileName);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        if (isOk) {
            *isOk = false;
        }
        return QByteArray();
    }

    // At most 64Kb data. The minimum is taken in qint64: narrowing the file size to int
    // first would wrap for files larger than 2 GiB and request a bogus read length.
    const QByteArray content = file.read(qMin<qint64>(file.size(), USHRT_MAX));
    file.close();

    if (isOk) {
        *isOk = true;
    }
    return detectTextEncoding(content);
}
/*!
  Convert \a content from \a fromEncoding to \a toEncoding with iconv and store the result
  in \a outContent.

  \a fromEncoding may be empty, in which case it is detected with detectTextEncoding().
  Empty input, or identical source and target encodings, is a successful no-op that copies
  \a content to \a outContent.

  Returns false when \a toEncoding is empty, when iconv does not support the conversion, or
  when the input contains an invalid multibyte sequence. For the other iconv errors
  (incomplete trailing sequence, output buffer exhausted) the bytes converted so far are
  stored in \a outContent, \a errString describes the problem and true is returned, so the
  caller can decide whether to keep the partial result.
 */
bool DTextEncoding::convertTextEncoding(
    QByteArray &content, QByteArray &outContent, const QByteArray &toEncoding, const QByteArray &fromEncoding, QString *errString)
{
    if (content.isEmpty() || fromEncoding == toEncoding) {
        // Nothing to convert; still hand the text back so outContent is valid on success.
        outContent = content;
        return true;
    }

    if (toEncoding.isEmpty()) {
        if (errString) {
            *errString = QStringLiteral("The encode that convert to is empty.");
        }
        return false;
    }

    QByteArray contentEncoding = fromEncoding;
    if (contentEncoding.isEmpty()) {
        contentEncoding = detectTextEncoding(content);
    }

    // iconv set errno when failed.
    iconv_t handle = iconv_open(toEncoding.data(), contentEncoding.data());
    if (reinterpret_cast<iconv_t>(-1) == handle) {
        if (EINVAL == errno && errString) {
            *errString = QStringLiteral("The conversion from fromcode to tocode is not supported by the implementation.");
        }
        return false;
    }

    // Convert exactly the payload bytes. QByteArray's implicit '\0' terminator is not part
    // of the text: converting it would append a 1/2/4 byte NUL depending on the target
    // encoding and would turn into a bogus trailing byte for multi-byte source encodings.
    size_t inBytesLeft = static_cast<size_t>(content.size());
    char *inbuf = content.data();

    // Four output bytes per input byte covers the widest target (UTF-32); the extra slack
    // leaves room for a byte order mark and a final shift sequence.
    const size_t maxBufferSize = inBytesLeft * 4 + 16;
    size_t outBytesLeft = maxBufferSize;
    char *bufferHeader = new char[maxBufferSize];
    char *outbuf = bufferHeader;

    size_t ret = iconv(handle, &inbuf, &inBytesLeft, &outbuf, &outBytesLeft);
    if (static_cast<size_t>(-1) != ret) {
        // Stateful encodings need a final call with a null input to emit the sequence that
        // returns the output to its initial shift state.
        ret = iconv(handle, nullptr, nullptr, &outbuf, &outBytesLeft);
    }

    int converError = 0;
    if (static_cast<size_t>(-1) == ret) {
        converError = errno;
        if (errString) {
            switch (converError) {
                case EILSEQ:
                    *errString = QStringLiteral("An invalid multibyte sequence has been encountered in the input.");
                    break;
                case EINVAL:
                    *errString = QStringLiteral("An incomplete multibyte sequence has been encountered in the input.");
                    break;
                case E2BIG:
                    *errString = QStringLiteral("There is not sufficient room at *outbuf.");
                    break;
                default:
                    break;
            }
        }
    }
    iconv_close(handle);

    // For other error, user decides to keep or remove converted text.
    const bool accepted = (EILSEQ != converError);
    if (accepted) {
        // iconv advanced outBytesLeft by exactly the number of bytes it produced.
        const size_t realConvertSize = maxBufferSize - outBytesLeft;
        outContent = QByteArray(bufferHeader, static_cast<int>(realConvertSize));
    }

    delete[] bufferHeader;
    return accepted;
}
/*!
  Convert the content of \a fileName from \a fromEncoding to \a toEncoding in place.

  Identical source and target encodings are a successful no-op that does not touch the
  file. An empty \a fromEncoding is resolved by convertTextEncoding(). On failure false is
  returned and \a errString, when given, receives the conversion or file error text.
 */
bool DTextEncoding::convertFileEncoding(const QString &fileName,
                                        const QByteArray &toEncoding,
                                        const QByteArray &fromEncoding,
                                        QString *errString)
{
    if (fromEncoding == toEncoding) {
        return true;
    }

    QFile file(fileName);
    if (!file.open(QFile::ReadWrite | QFile::Text)) {
        if (errString) {
            *errString = file.errorString();
        }
        return false;
    }

    QByteArray content = file.readAll();
    QByteArray outContent;
    if (!convertTextEncoding(content, outContent, toEncoding, fromEncoding, errString)) {
        file.close();
        return false;
    }

    // Overwrite from the start, then cut the file to the new length. Every step is checked
    // here rather than only inspecting error() after close(), so a short write or failed
    // resize cannot go unnoticed.
    const bool stored = file.seek(0)
                        && file.write(outContent) == outContent.size()
                        && file.resize(outContent.size())
                        && file.flush();
    if (!stored) {
        if (errString) {
            *errString = file.errorString();
        }
        file.close();
        return false;
    }

    file.close();
    if (QFile::NoError != file.error()) {
        if (errString) {
            *errString = file.errorString();
        }
        return false;
    }

    return true;
}
/*!
  Convert the content of \a fromFile from \a fromEncoding to \a toEncoding and write the
  result to \a toFile.

  Identical source and target encodings are a successful no-op. When both paths are equal
  the work is delegated to convertFileEncoding(). The destination is only opened once the
  conversion has succeeded, so a failed conversion never truncates or removes an existing
  destination file. On failure false is returned and \a errString, when given, receives the
  conversion or file error text.
 */
bool DTextEncoding::convertFileEncodingTo(const QString &fromFile,
                                          const QString &toFile,
                                          const QByteArray &toEncoding,
                                          const QByteArray &fromEncoding,
                                          QString *errString)
{
    if (fromEncoding == toEncoding) {
        return true;
    }

    if (fromFile == toFile) {
        return convertFileEncoding(fromFile, toEncoding, fromEncoding, errString);
    }

    QFile readFile(fromFile);
    if (!readFile.open(QFile::ReadOnly | QFile::Text)) {
        if (errString) {
            *errString = QString("Open convert from file failed, %1").arg(readFile.errorString());
        }
        return false;
    }

    // Read the whole source before the destination is opened for writing: opening truncates
    // the target, which would destroy the input if both names refer to the same file
    // through differently spelled paths.
    QByteArray content = readFile.readAll();
    readFile.close();

    QByteArray outContent;
    if (!convertTextEncoding(content, outContent, toEncoding, fromEncoding, errString)) {
        return false;
    }

    QFile writeFile(toFile);
    if (!writeFile.open(QFile::WriteOnly | QFile::Text)) {
        if (errString) {
            *errString = QString("Open convert to file failed, %1").arg(writeFile.errorString());
        }
        return false;
    }

    // Check the write and flush results directly so a short write is reported.
    const bool stored = writeFile.write(outContent) == outContent.size() && writeFile.flush();
    if (!stored) {
        if (errString) {
            *errString = writeFile.errorString();
        }
        writeFile.close();
        return false;
    }

    writeFile.close();
    if (QFile::NoError != writeFile.error()) {
        if (errString) {
            *errString = writeFile.errorString();
        }
        return false;
    }

    return true;
}
DCORE_END_NAMESPACE

View File

@ -16,6 +16,7 @@ if(LINUX)
${CMAKE_CURRENT_LIST_DIR}/ddbusinterface.cpp
${CMAKE_CURRENT_LIST_DIR}/ddbusextendedabstractinterface.cpp
${CMAKE_CURRENT_LIST_DIR}/ddbusextendedpendingcallwatcher.cpp
${CMAKE_CURRENT_LIST_DIR}/dtextencoding.cpp
)
else()
set(UTILS_SOURCE
@ -35,6 +36,7 @@ else()
${CMAKE_CURRENT_LIST_DIR}/ddbusinterface.cpp
${CMAKE_CURRENT_LIST_DIR}/ddbusextendedabstractinterface.cpp
${CMAKE_CURRENT_LIST_DIR}/ddbusextendedpendingcallwatcher.cpp
${CMAKE_CURRENT_LIST_DIR}/dtextencoding.cpp
)
endif()
file(GLOB UTILS_HEADER

247
tests/ut_dtextencoding.cpp Normal file
View File

@ -0,0 +1,247 @@
// SPDX-FileCopyrightText: 2022 UnionTech Software Technology Co., Ltd.
//
// SPDX-License-Identifier: LGPL-3.0-or-later
#include "dtextencoding.h"
#include <gtest/gtest.h>
#include <QLibrary>
#include <QFile>
#include <QTextCodec>
DCORE_USE_NAMESPACE
// Fixture for the DTextEncoding tests: shared sample data in several encodings plus a
// scratch file that is removed again in TearDown().
class ut_DTextEncoding : public testing::Test
{
public:
    // Probes the optional libraries and builds the encoded sample data.
    static void SetUpTestCase();

    // Scratch file helpers operating on tmpFileName.
    bool rewriteTempFile(const QByteArray &data);
    void removeTempFile();

    // Path of the scratch file.
    static QString tmpFileName;

    // The same kind of sample sentence, stored in three legacy encodings.
    static QByteArray dataGB18030;
    static QByteArray dataEUC_JP;
    static QByteArray dataKOI8_R;

    // Results of the library probes done in SetUpTestCase().
    static bool canLoadUchardet;
    static bool canLoadICU;

protected:
    void TearDown() override;
};
// Definitions of the fixture's static members. The probe flags start out false and the
// sample buffers start out empty; SetUpTestCase() fills them in.
bool ut_DTextEncoding::canLoadUchardet = false;
bool ut_DTextEncoding::canLoadICU = false;
// Scratch file shared by the file-based tests.
QString ut_DTextEncoding::tmpFileName = QString("/tmp/ut_DTextEncoding_temp.txt");
QByteArray ut_DTextEncoding::dataGB18030;
QByteArray ut_DTextEncoding::dataEUC_JP;
QByteArray ut_DTextEncoding::dataKOI8_R;
// One-time setup: record whether the optional detector libraries can be loaded and encode
// the UTF-8 sample sentences into GB18030, EUC-JP and KOI8-R through QTextCodec.
void ut_DTextEncoding::SetUpTestCase()
{
    QLibrary uchardet("libuchardet", "0");
    if (!uchardet.isLoaded()) {
        canLoadUchardet = uchardet.load();
        if (canLoadUchardet) {
            uchardet.unload();
        }
    }

    // Same probe as for uchardet. The condition must be negated here as well, otherwise
    // load() is never attempted and canLoadICU can never become true.
    QLibrary icuuc("libicuuc");
    if (!icuuc.isLoaded()) {
        canLoadICU = icuuc.load();
        if (canLoadICU) {
            icuuc.unload();
        }
    }

    // UTF-8 text (Chinese sample sentence): 中文测试一二三四123456789abcdefgh
    const QByteArray chineseUnicode("\u4e2d\u6587\u6d4b\u8bd5\u4e00\u4e8c\u4e09\u56db\u0031\u0032\u0033\u0034\u0035\u0036\u0037"
                                    "\u0038\u0039\u0061\u0062\u0063\u0064\u0065\u0066\u0067\u0068");
    // UTF-8 text (Japanese sample sentence): 日本語のテスト ワン ツー スリー フォー 123456789abcdefgh
    const QByteArray japaneseUnicode(
        "\u65e5\u672c\u8a9e\u306e\u30c6\u30b9\u30c8\u0020\u30ef\u30f3\u0020\u30c4\u30fc\u0020\u30b9\u30ea\u30fc\u0020\u30d5\u30a9"
        "\u30fc\u0020\u0031\u0032\u0033\u0034\u0035\u0036\u0037\u0038\u0039\u0061\u0062\u0063\u0064\u0065\u0066\u0067\u0068");
    // UTF-8 text (Russian sample sentence): Русский тест раз два три четыре 123456789abcdefgh
    const QByteArray russianUnicode(
        "\u0420\u0443\u0441\u0441\u043a\u0438\u0439\u0020\u0442\u0435\u0441\u0442\u0020\u0440\u0430\u0437\u0020\u0434\u0432\u0430"
        "\u0020\u0442\u0440\u0438\u0020\u0447\u0435\u0442\u044b\u0440\u0435\u0020\u0031\u0032\u0033\u0034\u0035\u0036\u0037\u0038"
        "\u0039\u0061\u0062\u0063\u0064\u0065\u0066\u0067\u0068");

    // A sample stays empty when the corresponding codec is not available.
    QTextCodec *codec = QTextCodec::codecForName("GB18030");
    if (codec) {
        dataGB18030 = codec->fromUnicode(QString::fromUtf8(chineseUnicode));
    }

    codec = QTextCodec::codecForName("EUC-JP");
    if (codec) {
        dataEUC_JP = codec->fromUnicode(QString::fromUtf8(japaneseUnicode));
    }

    codec = QTextCodec::codecForName("KOI8-R");
    if (codec) {
        dataKOI8_R = codec->fromUnicode(QString::fromUtf8(russianUnicode));
    }
}
bool ut_DTextEncoding::rewriteTempFile(const QByteArray &data)
{
QFile tmpFile(tmpFileName);
if (!tmpFile.open(QFile::WriteOnly | QFile::Truncate)) {
return false;
}
tmpFile.write(data);
tmpFile.close();
return true;
}
// Delete the scratch file when it exists.
void ut_DTextEncoding::removeTempFile()
{
    if (!QFile::exists(tmpFileName)) {
        return;
    }
    QFile::remove(tmpFileName);
}
void ut_DTextEncoding::TearDown()
{
removeTempFile();
}
// In-memory detection: the UTF-8 default plus the three encoded samples.
TEST_F(ut_DTextEncoding, testDetectTextEncode)
{
    // Default encoding is utf-8.
    const QByteArray forEmpty = DTextEncoding::detectTextEncoding("");
    ASSERT_EQ("UTF-8", forEmpty);

    const QByteArray forAscii = DTextEncoding::detectTextEncoding("12345678ABCDEFG");
    ASSERT_EQ("UTF-8", forAscii);

    const QByteArray forChinese = DTextEncoding::detectTextEncoding(dataGB18030);
    ASSERT_EQ("GB18030", forChinese);

    const QByteArray forJapanese = DTextEncoding::detectTextEncoding(dataEUC_JP);
    ASSERT_EQ("EUC-JP", forJapanese);

    const QByteArray forRussian = DTextEncoding::detectTextEncoding(dataKOI8_R);
    ASSERT_EQ("KOI8-R", forRussian);
}
// EUC-TW has no QTextCodec, so this round trip only runs when uchardet could be loaded.
TEST_F(ut_DTextEncoding, testDetectTextEncodeWithUchardet)
{
    if (!canLoadUchardet) {
        return;
    }

    const QByteArray uchardetEncoding("EUC-TW");

    // QTextCodec does not support EUC-TW.
    QTextCodec *qtCodec = QTextCodec::codecForName(uchardetEncoding);
    ASSERT_EQ(qtCodec, nullptr);

    // UTF-8 text (Traditional Chinese sample sentence): 繁體中文測試一二三四
    QByteArray dataZhTraditional("\u7e41\u9ad4\u4e2d\u6587\u6e2c\u8a66\u4e00\u4e8c\u4e09\u56db");

    // UTF-8 -> EUC-TW, which must then be detected as EUC-TW.
    QByteArray dataEUC_TW;
    ASSERT_TRUE(DTextEncoding::convertTextEncoding(dataZhTraditional, dataEUC_TW, uchardetEncoding));
    ASSERT_EQ(uchardetEncoding, DTextEncoding::detectTextEncoding(dataEUC_TW));

    // EUC-TW -> GB18030 -> UTF-8 must reproduce the original text.
    QByteArray viaGB18030;
    ASSERT_TRUE(DTextEncoding::convertTextEncoding(dataEUC_TW, viaGB18030, "GB18030"));

    QByteArray backToUTF8;
    ASSERT_TRUE(DTextEncoding::convertTextEncoding(viaGB18030, backToUTF8, "UTF-8"));
    ASSERT_EQ(dataZhTraditional, backToUTF8);
}
// File based detection: missing file, empty file, then each encoded sample.
TEST_F(ut_DTextEncoding, testDetectFileEncode)
{
    bool isOk = false;

    // The scratch file does not exist yet: empty result and isOk == false.
    ASSERT_EQ("", DTextEncoding::detectFileEncoding(tmpFileName, &isOk));
    ASSERT_FALSE(isOk);

    // An empty file is reported as UTF-8.
    ASSERT_TRUE(rewriteTempFile(""));
    ASSERT_EQ("UTF-8", DTextEncoding::detectFileEncoding(tmpFileName, &isOk));
    ASSERT_TRUE(isOk);

    struct Sample {
        const char *encoding;
        const QByteArray *data;
    };
    const Sample samples[] = {
        {"GB18030", &dataGB18030},
        {"EUC-JP", &dataEUC_JP},
        {"KOI8-R", &dataKOI8_R},
    };

    for (const Sample &sample : samples) {
        ASSERT_TRUE(rewriteTempFile(*sample.data));
        ASSERT_EQ(sample.encoding, DTextEncoding::detectFileEncoding(tmpFileName, &isOk));
        ASSERT_TRUE(isOk);
    }
}
// Round trips between GB18030, UTF-8, UTF-16 and UTF-32, cross-checked against QTextCodec.
TEST_F(ut_DTextEncoding, testConvertTextEncoding)
{
    // GB18030 -> UTF-8 and back.
    QByteArray dataUTF_8;
    ASSERT_TRUE(DTextEncoding::convertTextEncoding(dataGB18030, dataUTF_8, "UTF-8"));
    ASSERT_EQ("UTF-8", DTextEncoding::detectTextEncoding(dataUTF_8));

    QTextCodec *codec = QTextCodec::codecForName("GB18030");
    ASSERT_EQ(codec->toUnicode(dataGB18030).toUtf8(), dataUTF_8);

    QByteArray convertedGB18030;
    ASSERT_TRUE(DTextEncoding::convertTextEncoding(dataUTF_8, convertedGB18030, "GB18030"));
    ASSERT_EQ(dataGB18030, convertedGB18030);

    // Convert with multi bytes encoding: UTF-8 -> UTF-16 and back.
    QByteArray dataUTF_16;
    ASSERT_TRUE(DTextEncoding::convertTextEncoding(dataUTF_8, dataUTF_16, "UTF-16"));
    ASSERT_EQ("UTF-16", DTextEncoding::detectTextEncoding(dataUTF_16));

    codec = QTextCodec::codecForName("UTF-16");
    ASSERT_EQ(codec->toUnicode(dataUTF_16).toUtf8(), dataUTF_8);

    QByteArray convertedUTF8;
    ASSERT_TRUE(DTextEncoding::convertTextEncoding(dataUTF_16, convertedUTF8, "UTF-8"));
    ASSERT_EQ(dataUTF_8, convertedUTF8);

    // UTF-8 -> UTF-32 and back.
    QByteArray dataUTF_32;
    ASSERT_TRUE(DTextEncoding::convertTextEncoding(dataUTF_8, dataUTF_32, "UTF-32"));
    ASSERT_EQ("UTF-32", DTextEncoding::detectTextEncoding(dataUTF_32));

    codec = QTextCodec::codecForName("UTF-32");
    ASSERT_EQ(codec->toUnicode(dataUTF_32).toUtf8(), dataUTF_8);

    // The return trip must start from the UTF-32 data; reusing the UTF-16 buffer here
    // would only repeat the previous check and leave UTF-32 -> UTF-8 untested.
    ASSERT_TRUE(DTextEncoding::convertTextEncoding(dataUTF_32, convertedUTF8, "UTF-8"));
    ASSERT_EQ(dataUTF_8, convertedUTF8);
}
// Conversions that must be rejected: an unknown target encoding name, and KOI8-R as the
// target for the GB18030 sample.
TEST_F(ut_DTextEncoding, testConvertTextEncodingWithError)
{
    QByteArray converted;

    const bool unknownTarget = DTextEncoding::convertTextEncoding(dataGB18030, converted, "ERROR");
    ASSERT_FALSE(unknownTarget);

    const bool toKOI8_R = DTextEncoding::convertTextEncoding(dataGB18030, converted, "KOI8-R");
    ASSERT_FALSE(toKOI8_R);
}
// In-place file conversion, verified through file based detection.
TEST_F(ut_DTextEncoding, testConvertFileEncoding)
{
    ASSERT_TRUE(rewriteTempFile(dataGB18030));

    // Convert the same file step by step and check each result.
    const char *const targets[] = {"UTF-8", "UTF-32"};
    for (const char *target : targets) {
        ASSERT_TRUE(DTextEncoding::convertFileEncoding(tmpFileName, target));
        ASSERT_EQ(target, DTextEncoding::detectFileEncoding(tmpFileName));
    }

    // An empty file name must fail.
    ASSERT_FALSE(DTextEncoding::convertFileEncoding("", "UTF-32"));
}
// Conversion into a separate destination file.
TEST_F(ut_DTextEncoding, testConvertFileEncodingTo)
{
    const QString destination("/tmp/ut_DTextEncoding_temp_testConvertFileEncodingTo.txt");
    ASSERT_TRUE(rewriteTempFile(dataGB18030));

    // Target given explicitly as GB18030: the destination must exist afterwards.
    const bool toGB18030 = DTextEncoding::convertFileEncodingTo(tmpFileName, destination, "GB18030");
    ASSERT_TRUE(toGB18030);
    ASSERT_TRUE(QFile::exists(destination));

    const bool toUTF8 = DTextEncoding::convertFileEncodingTo(tmpFileName, destination, "UTF-8");
    ASSERT_TRUE(toUTF8);
    ASSERT_TRUE(QFile::exists(destination));
    ASSERT_EQ("UTF-8", DTextEncoding::detectFileEncoding(destination));

    const bool toUTF32 = DTextEncoding::convertFileEncodingTo(tmpFileName, destination, "UTF-32");
    ASSERT_TRUE(toUTF32);
    ASSERT_EQ("UTF-32", DTextEncoding::detectFileEncoding(destination));

    // Clean up the destination; the source is removed in TearDown().
    ASSERT_TRUE(QFile::remove(destination));
}
// Failure paths of convertFileEncodingTo(): no destination file may be left behind.
TEST_F(ut_DTextEncoding, testConvertFileEncodingToWithError)
{
    const QString destination("/tmp/ut_DTextEncoding_temp_testConvertFileEncodingToWithError.txt");

    // Empty source path.
    const bool emptySource = DTextEncoding::convertFileEncodingTo("", destination, "UTF-32");
    ASSERT_FALSE(emptySource);

    // Empty destination path.
    const bool emptyDestination = DTextEncoding::convertFileEncodingTo(tmpFileName, "", "UTF-32");
    ASSERT_FALSE(emptyDestination);

    // Unknown target encoding name.
    const bool unknownTarget = DTextEncoding::convertFileEncodingTo(tmpFileName, destination, "ERROR");
    ASSERT_FALSE(unknownTarget);
    ASSERT_FALSE(QFile::exists(destination));

    // Existing source, but the conversion to EUC-JP is expected to fail.
    ASSERT_TRUE(rewriteTempFile(dataGB18030));
    const bool toEUC_JP = DTextEncoding::convertFileEncodingTo(tmpFileName, destination, "EUC-JP");
    ASSERT_FALSE(toEUC_JP);
    ASSERT_FALSE(QFile::exists(destination));
}