From a35c1a058da94a58ed98aa4dadc4b3ce6a2db86f Mon Sep 17 00:00:00 2001 From: Damien Elmes Date: Mon, 31 Jul 2023 12:02:51 +1000 Subject: [PATCH] Extract inline images as part of media check We also need to get to the bottom of what's causing this: https://forums.ankiweb.net/t/anki-browse-extremely-laggy/32533 --- Cargo.lock | 7 ++ Cargo.toml | 1 + cargo/licenses.json | 9 ++ ftl/core/media-check.ftl | 1 + rslib/Cargo.toml | 1 + rslib/src/media/check.rs | 241 ++++++++++++++++++++++++--------------- 6 files changed, 170 insertions(+), 90 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a13c99f81..4eb629c4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -108,6 +108,7 @@ dependencies = [ "convert_case", "criterion", "csv", + "data-encoding", "difflib", "dirs", "envy", @@ -934,6 +935,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "data-encoding" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308" + [[package]] name = "deadpool" version = "0.9.5" diff --git a/Cargo.toml b/Cargo.toml index ce1f8b1cf..ae1dded41 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,7 @@ clap = { version = "4.3.10", features = ["derive"] } coarsetime = "0.1.23" convert_case = "0.6.0" criterion = { version = "0.5.1" } +data-encoding = "2.4.0" difflib = "0.4.0" flate2 = "1.0.26" fluent = "0.16.0" diff --git a/cargo/licenses.json b/cargo/licenses.json index e4841edea..0a262f6e2 100644 --- a/cargo/licenses.json +++ b/cargo/licenses.json @@ -494,6 +494,15 @@ "license_file": null, "description": "Bare bones CSV parsing with no_std support." }, + { + "name": "data-encoding", + "version": "2.4.0", + "authors": "Julien Cretin ", + "repository": "https://github.com/ia0/data-encoding", + "license": "MIT", + "license_file": null, + "description": "Efficient and customizable data-encoding functions like base64, base32, and hex" + }, { "name": "deadpool", "version": "0.9.5", diff --git a/ftl/core/media-check.ftl b/ftl/core/media-check.ftl index c930e740d..168ca11f6 100644 --- a/ftl/core/media-check.ftl +++ b/ftl/core/media-check.ftl @@ -14,6 +14,7 @@ media-check-unused-count = Unused files: { $count } media-check-renamed-count = Renamed files: { $count } media-check-oversize-count = Over 100MB: { $count } media-check-subfolder-count = Subfolders: { $count } +media-check-extracted-count = Extracted images: { $count } ## Shown at the top of each section diff --git a/rslib/Cargo.toml b/rslib/Cargo.toml index 13258d008..67be8d3b3 100644 --- a/rslib/Cargo.toml +++ b/rslib/Cargo.toml @@ -52,6 +52,7 @@ chrono.workspace = true coarsetime.workspace = true convert_case.workspace = true csv.workspace = true +data-encoding.workspace = true difflib.workspace = true dirs.workspace = true envy.workspace = true diff --git a/rslib/src/media/check.rs b/rslib/src/media/check.rs index 85d3bed70..0656c5059 100644 --- a/rslib/src/media/check.rs +++ b/rslib/src/media/check.rs @@ -6,16 +6,21 @@ use std::collections::HashMap; use std::collections::HashSet; use std::fs; use std::io; -use std::path::Path; use anki_i18n::without_unicode_isolation; +use anki_io::write_file; +use data_encoding::BASE64; +use once_cell::sync::Lazy; +use regex::Regex; use tracing::debug; +use tracing::info; use crate::error::DbErrorKind; use crate::latex::extract_latex_expanding_clozes; use crate::media::files::data_for_file; use crate::media::files::filename_if_normalized; use crate::media::files::normalize_nfc_filename; +use crate::media::files::sha1_of_data; use crate::media::files::trash_folder; use crate::media::MediaManager; use crate::prelude::*; @@ -24,6 +29,7 @@ use crate::sync::media::progress::MediaCheckProgress; use crate::sync::media::MAX_INDIVIDUAL_MEDIA_FILE_SIZE; use crate::text::extract_media_refs; use crate::text::normalize_to_nfc; +use crate::text::CowMapping; use crate::text::MediaRef; use crate::text::REMOTE_FILENAME; @@ -37,6 +43,7 @@ pub struct MediaCheckOutput { pub oversize: Vec, pub trash_count: u64, pub trash_bytes: u64, + pub inlined_image_count: u64, } #[derive(Debug, PartialEq, Eq, Default)] @@ -57,6 +64,7 @@ pub struct MediaChecker<'a> { col: &'a mut Collection, media: MediaManager, progress: ThrottlingProgressHandler, + inlined_image_count: u64, } impl MediaChecker<'_> { @@ -65,6 +73,7 @@ impl MediaChecker<'_> { media: col.media()?, progress: col.new_progress_handler(), col, + inlined_image_count: 0, }) } @@ -82,6 +91,7 @@ impl MediaChecker<'_> { oversize: folder_check.oversize, trash_count, trash_bytes, + inlined_image_count: self.inlined_image_count, }) } @@ -102,6 +112,11 @@ impl MediaChecker<'_> { buf += &tr.media_check_unused_count(output.unused.len()); buf.push('\n'); + if output.inlined_image_count > 0 { + buf += &tr.media_check_extracted_count(output.inlined_image_count); + buf.push('\n'); + } + if !output.renamed.is_empty() { buf += &tr.media_check_renamed_count(output.renamed.len()); buf.push('\n'); @@ -344,12 +359,7 @@ impl MediaChecker<'_> { .or_insert_with(Vec::new) .push(nid) }; - if fix_and_extract_media_refs( - &mut note, - &mut tracker, - renamed, - &self.media.media_folder, - )? { + if self.fix_and_extract_media_refs(&mut note, &mut tracker, renamed)? { // note was modified, needs saving note.prepare_for_update(nt, false)?; note.set_modified(usn); @@ -368,80 +378,102 @@ impl MediaChecker<'_> { Ok(referenced_files) } -} -/// Returns true if note was modified. -fn fix_and_extract_media_refs( - note: &mut Note, - mut tracker: impl FnMut(String), - renamed: &HashMap, - media_folder: &Path, -) -> Result { - let mut updated = false; + /// Returns true if note was modified. + fn fix_and_extract_media_refs( + &mut self, + note: &mut Note, + mut tracker: impl FnMut(String), + renamed: &HashMap, + ) -> Result { + let mut updated = false; - for idx in 0..note.fields().len() { - let field = normalize_and_maybe_rename_files( - ¬e.fields()[idx], - renamed, - &mut tracker, - media_folder, - ); - if let Cow::Owned(field) = field { - // field was modified, need to save - note.set_field(idx, field)?; - updated = true; - } - } - - Ok(updated) -} - -/// Convert any filenames that are not in NFC form into NFC, -/// and update any files that were renamed on disk. -fn normalize_and_maybe_rename_files<'a>( - field: &'a str, - renamed: &HashMap, - mut tracker: impl FnMut(String), - media_folder: &Path, -) -> Cow<'a, str> { - let refs = extract_media_refs(field); - let mut field: Cow = field.into(); - - for media_ref in refs { - if REMOTE_FILENAME.is_match(media_ref.fname) { - // skip remote references - continue; - } - - // normalize fname into NFC - let mut fname = normalize_to_nfc(&media_ref.fname_decoded); - // and look it up to see if it's been renamed - if let Some(new_name) = renamed.get(fname.as_ref()) { - fname = new_name.to_owned().into(); - } - // if the filename was in NFC and was not renamed as part of the - // media check, it may have already been renamed during a previous - // sync. If that's the case and the renamed version exists on disk, - // we'll need to update the field to match it. It may be possible - // to remove this check in the future once we can be sure all media - // files stored on AnkiWeb are in normalized form. - if matches!(fname, Cow::Borrowed(_)) { - if let Cow::Owned(normname) = normalize_nfc_filename(fname.as_ref().into()) { - let path = media_folder.join(&normname); - if path.exists() { - fname = normname.into(); - } + for idx in 0..note.fields().len() { + let field = + self.normalize_and_maybe_rename_files(¬e.fields()[idx], renamed, &mut tracker)?; + if let Cow::Owned(field) = field { + // field was modified, need to save + note.set_field(idx, field)?; + updated = true; } } - // update the field if the filename was modified - if let Cow::Owned(ref new_name) = fname { - field = rename_media_ref_in_field(field.as_ref(), &media_ref, new_name).into(); - } - // and mark this filename as having been referenced - tracker(fname.into_owned()); + + Ok(updated) } - field + /// Convert any filenames that are not in NFC form into NFC, + /// and update any files that were renamed on disk. + fn normalize_and_maybe_rename_files<'a>( + &mut self, + field: &'a str, + renamed: &HashMap, + mut tracker: impl FnMut(String), + ) -> Result> { + let refs = extract_media_refs(field); + let mut field: Cow = field.into(); + + for media_ref in refs { + if REMOTE_FILENAME.is_match(media_ref.fname) { + // skip remote references + continue; + } + + let mut fname = self.maybe_extract_inline_image(&media_ref.fname_decoded)?; + + // normalize fname into NFC + fname = fname.map_cow(normalize_to_nfc); + // and look it up to see if it's been renamed + if let Some(new_name) = renamed.get(fname.as_ref()) { + fname = new_name.to_owned().into(); + } + // if the filename was in NFC and was not renamed as part of the + // media check, it may have already been renamed during a previous + // sync. If that's the case and the renamed version exists on disk, + // we'll need to update the field to match it. It may be possible + // to remove this check in the future once we can be sure all media + // files stored on AnkiWeb are in normalized form. + if matches!(fname, Cow::Borrowed(_)) { + if let Cow::Owned(normname) = normalize_nfc_filename(fname.as_ref().into()) { + let path = self.media.media_folder.join(&normname); + if path.exists() { + fname = normname.into(); + } + } + } + // update the field if the filename was modified + if let Cow::Owned(ref new_name) = fname { + field = rename_media_ref_in_field(field.as_ref(), &media_ref, new_name).into(); + } + // and mark this filename as having been referenced + tracker(fname.into_owned()); + } + + Ok(field) + } + + fn maybe_extract_inline_image<'a>(&mut self, fname_decoded: &'a str) -> Result> { + static BASE64_IMG: Lazy = Lazy::new(|| { + Regex::new("(?i)^data:image/(jpg|jpeg|png|gif|webp);base64,(.+)$").unwrap() + }); + + let Some(caps) = BASE64_IMG.captures(fname_decoded) else { + return Ok(fname_decoded.into()); + }; + let (_all, [ext, data]) = caps.extract(); + let data = data.trim(); + let data = match BASE64.decode(data.as_bytes()) { + Ok(data) => data, + Err(err) => { + info!("invalid base64: {}", err); + return Ok(fname_decoded.into()); + } + }; + let checksum = hex::encode(sha1_of_data(&data)); + let external_fname = format!("paste-{checksum}.{ext}"); + write_file(self.media.media_folder.join(&external_fname), data)?; + self.inlined_image_count += 1; + Ok(external_fname.into()) + } } fn rename_media_ref_in_field(field: &str, media_ref: &MediaRef, new_name: &str) -> String { @@ -502,8 +534,10 @@ pub(crate) mod test { include_bytes!("../../tests/support/mediacheck.anki2"); use std::collections::HashMap; + use std::path::Path; use anki_io::create_dir; + use anki_io::read_to_string; use anki_io::write_file; use tempfile::tempdir; use tempfile::TempDir; @@ -558,7 +592,8 @@ pub(crate) mod test { dirs: vec!["folder".to_string()], oversize: vec![], trash_count: 0, - trash_bytes: 0 + trash_bytes: 0, + inlined_image_count: 0, } ); @@ -675,7 +710,8 @@ Unused: unused.jpg dirs: vec![], oversize: vec![], trash_count: 0, - trash_bytes: 0 + trash_bytes: 0, + inlined_image_count: 0, } ); assert!(fs::metadata(mgr.media_folder.join("ぱぱ.jpg")).is_ok()); @@ -693,7 +729,8 @@ Unused: unused.jpg dirs: vec![], oversize: vec![], trash_count: 0, - trash_bytes: 0 + trash_bytes: 0, + inlined_image_count: 0, } ); assert!(fs::metadata(mgr.media_folder.join("ぱぱ.jpg")).is_err()); @@ -703,31 +740,55 @@ Unused: unused.jpg Ok(()) } - fn normalize_and_maybe_rename_files_helper(field: &str) -> HashSet { + fn normalize_and_maybe_rename_files_helper( + checker: &mut MediaChecker, + field: &str, + ) -> HashSet { let mut seen = HashSet::new(); - normalize_and_maybe_rename_files( - field, - &HashMap::new(), - |fname| { + checker + .normalize_and_maybe_rename_files(field, &HashMap::new(), |fname| { seen.insert(fname); - }, - Path::new("/tmp"), - ); + }) + .unwrap(); seen } #[test] - fn html_encoding() { + fn html_encoding() -> Result<()> { + let (_dir, _mgr, mut col) = common_setup()?; + let mut checker = col.media_checker()?; + let mut field = "[sound:a & b.mp3]"; - let seen = normalize_and_maybe_rename_files_helper(field); + let seen = normalize_and_maybe_rename_files_helper(&mut checker, field); assert!(seen.contains("a & b.mp3")); field = r#""#; - let seen = normalize_and_maybe_rename_files_helper(field); + let seen = normalize_and_maybe_rename_files_helper(&mut checker, field); assert!(seen.contains("a&b.jpg")); field = r#""#; - let seen = normalize_and_maybe_rename_files_helper(field); + let seen = normalize_and_maybe_rename_files_helper(&mut checker, field); assert!(seen.contains("a&b.jpg")); + Ok(()) + } + + #[test] + fn inlined_images() -> Result<()> { + let (_dir, mgr, mut col) = common_setup()?; + NoteAdder::basic(&mut col) + // b'foo' + .fields(&["foo", ""]) + .add(&mut col); + let mut checker = col.media_checker()?; + let output = checker.check()?; + assert_eq!(output.inlined_image_count, 1); + assert_eq!( + &read_to_string( + mgr.media_folder + .join("paste-0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33.jpg") + )?, + "foo" + ); + Ok(()) } }