Run file classification in parallel threads

Our file classification is not exactly in large quantities. Fedora's
kernel-debuginfo-common has circa 25000 files, and classifying them
serially takes about a minute on my rusty old T520. With parallelization
this goes down to ~24s.

It's all remarkably simple, except for the fact that libmagic is not
thread-safe so we need separate magic handles for each of our threads.
This will leak those libmagic handles on error situations, I don't see
any obvious, nice way to handle that.
This commit is contained in:
Panu Matilainen 2019-05-08 17:06:21 +03:00 committed by Florian Festi
parent 18c92e6fe7
commit 41f0e214f2
1 changed files with 22 additions and 8 deletions

View File

@ -677,6 +677,7 @@ static void rpmfcAttributes(rpmfc fc, int ix, const char *ftype, const char *ful
/* Add attributes on libmagic type & path pattern matches */
if (matches(&(*attr)->incl, ftype, path, is_executable)) {
argvAddTokens(&fc->fattrs[ix], (*attr)->name);
#pragma omp critical(fahash)
fattrHashAddEntry(fc->fahash, attr-fc->atypes, ix);
}
}
@ -1063,7 +1064,7 @@ static int initAttrs(rpmfc fc)
rpmRC rpmfcClassify(rpmfc fc, ARGV_t argv, rpm_mode_t * fmode)
{
int msflags = MAGIC_CHECK | MAGIC_COMPRESS | MAGIC_NO_CHECK_TOKENS;
magic_t ms = NULL;
int nerrors = 0;
rpmRC rc = RPMRC_FAIL;
if (fc == NULL) {
@ -1094,18 +1095,23 @@ rpmRC rpmfcClassify(rpmfc fc, ARGV_t argv, rpm_mode_t * fmode)
/* Build (sorted) file class dictionary. */
fc->cdict = rpmstrPoolCreate();
ms = magic_open(msflags);
#pragma omp parallel
{
/* libmagic is not thread-safe, each thread needs to a private handle */
magic_t ms = magic_open(msflags);
if (ms == NULL) {
rpmlog(RPMLOG_ERR, _("magic_open(0x%x) failed: %s\n"),
msflags, strerror(errno));
goto exit;
#pragma omp cancel parallel
}
if (magic_load(ms, NULL) == -1) {
rpmlog(RPMLOG_ERR, _("magic_load failed: %s\n"), magic_error(ms));
goto exit;
#pragma omp cancel parallel
}
#pragma omp for reduction(+:nerrors)
for (int ix = 0; ix < fc->nfiles; ix++) {
rpmsid ftypeId;
const char * ftype;
@ -1148,7 +1154,8 @@ rpmRC rpmfcClassify(rpmfc fc, ARGV_t argv, rpm_mode_t * fmode)
s, mode, magic_error(ms));
/* only executable files are critical to dep extraction */
if (is_executable) {
goto exit;
nerrors++;
#pragma omp cancel for
}
/* unrecognized non-executables get treated as "data" */
ftype = "data";
@ -1171,21 +1178,28 @@ rpmRC rpmfcClassify(rpmfc fc, ARGV_t argv, rpm_mode_t * fmode)
/* Add to file class dictionary and index array */
if (fcolor != RPMFC_WHITE && (fcolor & RPMFC_INCLUDE)) {
ftypeId = rpmstrPoolId(fc->cdict, ftype, 1);
#pragma omp atomic
fc->fknown++;
} else {
ftypeId = rpmstrPoolId(fc->cdict, "", 1);
#pragma omp atomic
fc->fwhite++;
}
/* Pool id's start from 1, for headers we want it from 0 */
fc->fcdictx[ix] = ftypeId - 1;
}
rc = RPMRC_OK;
if (ms != NULL)
magic_close(ms);
} /* omp parallel */
if (nerrors == 0)
rc = RPMRC_OK;
exit:
/* No more additions after this, freeze pool to minimize memory use */
rpmstrPoolFreeze(fc->cdict, 0);
if (ms != NULL)
magic_close(ms);
return rc;
}