-
-
Save ssokolow/0d9f5c5e4a8a37a962875af205bcc723 to your computer and use it in GitHub Desktop.
| /* POSIX paths in JSON via escaping which | |
| doesn't alter valid UTF-8 paths. | |
| The trick is recognizing that JSON can store binary nulls in strings | |
| but nulls are the only character that can't occur in POSIX paths, | |
| so we can use it as an escape character that won't change how existing | |
| serialized paths get interpreted. | |
| Copyright 2018-2020, Stephan Sokolow | |
| This code is released under your choice of the MIT or Apache-2.0 licenses. | |
| https://opensource.org/licenses/MIT | |
| https://opensource.org/licenses/Apache-2.0 | |
| */ | |
| use std::borrow::Cow; | |
| use std::ffi::{OsStr, OsString}; | |
| use std::str; | |
| // Platform-specific imports | |
| use std::os::unix::ffi::{OsStrExt, OsStringExt}; | |
| /// Escape an OS path into something which can safely be stored in a valid UTF-8 string | |
| fn escape_path<P: AsRef<OsStr> + ?Sized>(path: &P) -> Cow<'_, str> { | |
| escape_path_inner(path.as_ref()) | |
| } | |
/// Inner function for `escape_path` to avoid the risk of monomorphization bloat
///
/// NUL is used as the escape character because it is the one character which cannot
/// occur in a POSIX path. The encoding is:
///
/// * Valid UTF-8 other than NUL is passed through unchanged.
/// * A literal NUL is written as `\0\0`.
/// * Every byte which is not part of valid UTF-8 is written as `\0` followed by the
///   `char` with the same numeric value (eg. `b"\xe7"` becomes `"\0\u{00e7}"`).
///
/// Returns `Cow::Borrowed` when nothing needed escaping and `Cow::Owned` otherwise.
///
/// Adapted from the example code on the `std::str::Utf8Error` rustdoc page
///
/// TODO: Support Windows... ideally in a way that results in the same conversion logic
///       as ntfs-3g uses.
fn escape_path_inner(path: &OsStr) -> Cow<'_, str> {
    // In the by-far most common case (valid UTF-8 containing no NUL) nothing needs to be
    // escaped, so hand back a borrow of the input rather than allocating a copy of it.
    if let Some(path_str) = path.to_str() {
        if !path_str.contains('\0') {
            return Cow::Borrowed(path_str);
        }
    }

    // In the very uncommon case, build a copy of the string with escapes inserted
    let mut input = path.as_bytes();

    // Preallocate for four escapes
    // (Just a guess, based on four mojibake'd latin1 bytes, two UTF-16 surrogates,
    // or one UTF-32 character)
    let mut result = String::with_capacity(path.len().saturating_add(4));

    loop {
        match str::from_utf8(input) {
            Ok(valid) => {
                // Everything that remains is valid UTF-8, so only NULs need escaping
                push_nul_escaped(&mut result, valid);
                break;
            }
            Err(error) => {
                // Pass through the valid span which precedes the error
                let (valid, after_valid) = input.split_at(error.valid_up_to());

                // Allowed because `valid_up_to()` is, by definition, the length of the
                // prefix which `from_utf8` already accepted.
                #[allow(clippy::result_expect_used)]
                let valid = str::from_utf8(valid)
                    .expect("from_utf8 on left-hand output of valid_up_to()");
                push_nul_escaped(&mut result, valid);

                match error.error_len() {
                    // An invalid sequence of known length: escape it, then resume after it
                    Some(invalid_len) => {
                        // `error_len()` describes bytes starting at `valid_up_to()`, so
                        // it should never exceed `after_valid.len()`.
                        let (invalid, rest) = after_valid.split_at(invalid_len);
                        push_byte_escaped(&mut result, invalid);
                        input = rest;
                    }
                    // The input ended part-way through a sequence: escape all that is left
                    None => {
                        push_byte_escaped(&mut result, after_valid);
                        break;
                    }
                }
            }
        }
    }

    Cow::Owned(result)
}

/// Append `valid` to `out`, doubling every NUL so that it round-trips properly
fn push_nul_escaped(out: &mut String, valid: &str) {
    for u_char in valid.chars() {
        if u_char == '\0' {
            out.push('\0');
        }
        out.push(u_char);
    }
}

/// Append each of `bytes` to `out` as `\0` followed by the `char` with the same value
fn push_byte_escaped(out: &mut String, bytes: &[u8]) {
    for &byte in bytes {
        out.push('\0');
        out.push(char::from(byte));
    }
}
/// Reverse `escape_path`, turning an escaped string back into an OS string
///
/// A `\0` marks the `char` which follows it as a single raw byte. Every other `char` is
/// copied through as its UTF-8 encoding. Strings containing no `\0` are returned as a
/// borrow without being copied.
///
/// Input which `escape_path` would not have produced is not rejected: an escaped `char`
/// above U+00FF is truncated to its low byte by the `as u8` cast, and a lone `\0` at the
/// very end of the string is dropped.
///
/// (`allow(dead_code)` because its purpose is to exist on standby and pass unit tests,
/// awaiting the need to unescape my emergency records.)
#[allow(dead_code)]
fn unescape_path(path: &str) -> Cow<'_, OsStr> {
    if path.contains('\0') {
        // Something was escaped, so the raw bytes have to be rebuilt
        let mut bytes: Vec<u8> = Vec::with_capacity(path.len());
        let mut encode_buf = [0_u8; 4];
        let mut chars = path.chars();

        while let Some(current) = chars.next() {
            if current != '\0' {
                // Ordinary character: copy its UTF-8 encoding through
                bytes.extend(current.encode_utf8(&mut encode_buf).as_bytes());
            } else if let Some(escaped) = chars.next() {
                // Escape marker: the character after it stands for one raw byte
                bytes.push(escaped as u8);
            }
        }

        Cow::Owned(OsString::from_vec(bytes))
    } else {
        // By far the most common case: nothing was escaped, so just borrow the input
        Cow::Borrowed(OsStr::new(path))
    }
}
#[cfg(test)]
mod tests {
    use super::{escape_path, unescape_path};
    use std::ffi::OsString;
    use std::os::unix::ffi::OsStringExt;

    /// `(raw path bytes, expected escaped form)` pairs shared by the tests below
    const TEST_STRINGS: &[(&[u8], &str)] = &[
        // all valid utf-8
        (b"string with no invalid utf-8", "string with no invalid utf-8"),
        // typical string with invalid utf-8
        (b"/un/fichier/fran\xe7ais", "/un/fichier/fran\0\u{00e7}ais"),
        // starting with invalid utf-8
        (b"\xe7a va", "\0\u{00e7}a va"),
        // invalid span length > 1
        (b"foo\xe7\xe7bar", "foo\0\u{00e7}\0\u{00e7}bar"),
        // only invalid characters
        (b"\xe7\xe7", "\0\u{00e7}\0\u{00e7}"),
        // empty string
        (b"", ""),
        // ending with invalid utf-8 less than 3 characters (see utf8error::error_len)
        (b"foo\xe7", "foo\0\u{00e7}"),
        (b"foo\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}"),
        // ending with invalid utf-8 more than 3 characters (see utf8error::error_len)
        (b"foo\xe7\xe7\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}\0\u{00e7}\0\u{00e7}"),
        // all valid utf-8, but with nulls
        (b"\0string with no\0\0invalid utf-8\0", "\0\0string with no\0\0\0\0invalid utf-8\0\0"),
    ];

    /// Test that escape_path works properly
    #[test]
    fn test_escape_path() {
        for (input, expected) in TEST_STRINGS {
            let os_string = OsString::from_vec(input.to_vec());
            let escaped = escape_path(&os_string);
            assert_eq!(escaped, *expected);
        }
    }

    /// Test that unescape_path decodes the expected escaped forms and is symmetrical to
    /// escape_path
    #[test]
    fn test_unescape_path() {
        for (input, expected) in TEST_STRINGS {
            let os_string = OsString::from_vec(input.to_vec());

            // Decode the hand-written escaped form directly, so a decoding bug which
            // happens to mirror an encoding bug cannot hide behind the round trip
            let decoded = unescape_path(expected);
            assert_eq!(&*decoded, os_string.as_os_str());

            // ...and confirm that decoding inverts the live output of escape_path
            let escaped = escape_path(&os_string);
            assert_eq!(&unescape_path(&escaped), &os_string.as_os_str());
        }
    }

    /// Test that nulls in valid UTF-8 round-trip successfully
    #[test]
    fn test_null_round_tripping() {
        let test_strings: &[(&[u8], &str)] = &[
            (b"\0foo", "\0\0foo"),
            (b"foo\0bar", "foo\0\0bar"),
            (b"foo\0\0bar", "foo\0\0\0\0bar"),
            (b"foo\0", "foo\0\0"),
            (b"\0foo\0bar\xe7baz\0\0quux\0", "\0\0foo\0\0bar\0\u{00e7}baz\0\0\0\0quux\0\0"),
        ];

        for (in_vec, expected_escaped) in test_strings {
            let os_string = OsString::from_vec(in_vec.to_vec());

            let escaped = escape_path(&os_string);
            assert_eq!(&escaped, expected_escaped);

            let round_tripped = unescape_path(&escaped);
            assert_eq!(os_string, round_tripped);
        }
    }
}
Fixed. It now escapes \0 as \0\0 so it will...
- Successfully round-trip all valid `OsStr`/`OsString` contents.
- Leave all POSIX filesystem paths which are valid UTF-8 unchanged.
- Use `\0` as an escape character to store `b"\xe7"` as `\0\u{00e7}`.
- Encode `\0` as `\0\0` so it can be distinguished from use of `\0` as an escape character.
This should be fully backwards compatible with serde_json's existing behaviour, since serde_json fails if a Path or PathBuf contains invalid UTF-8.
Super interested in this gist as I am facing the same issue but with msgpack.
If I am not wrong the let mut input = path.as_bytes(); is Unix specific. How do you intend to handle the path on windows?
On the 2nd thought... may be it does not need special handling on windows because I think windows enforces UTF-16.
> If I am not wrong the `let mut input = path.as_bytes();` is Unix specific. How do you intend to handle the path on windows?
>
> On the 2nd thought... may be it does not need special handling on windows because I think windows enforces UTF-16.
Windows allows un-paired surrogates, which are also forbidden in UTF-8. (For compatibility with filenames generated back when Unicode was expected to only be a 16-bit fixed-width encoding, and the encoding used was UCS-2 rather than UTF-16.)
The Windows equivalent to as_bytes() is encode_wide() provided by the std::os::windows::ffi::OsStringExt trait.
The plan I keep not having time to enact is to look up how to generate such ill-formed paths if I borrow my brother's Windows 10 PC for a couple of minutes, put them on an NTFS-formatted flash drive, stick it into my Linux PC, and then replicate whatever behaviour the ntfs-3g NTFS driver implements for translating such filenames between the Linux and Windows worlds.
In the interest of interoperable, panic-free storage of POSIX paths in JSON or other UTF-8-requiring formats, I'm also willing to release this under other licenses if you need that.
Bear in mind that, as-is, it assumes it's receiving a path so it does not escape `\0` as `\0\0`. I'm willing to add that if anyone wants something suitable for all `OsStr` and `OsString` values that are unlikely to contain a `\0` but it can't be ruled out.