Last active
January 23, 2026 07:10
-
-
Save righettod/c9d363f6ca0df348dfd47370972db870 to your computer and use it in GitHub Desktop.
Sample code that attempts to sanitize HTML content intended to be passed to an "HTML to PDF" exporter.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.time.Duration;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
| /** | |
| * Sample code trying to sanitize an HTML content intended to be passed to an "HTML to PDF" exporter.<br> | |
| * JSoup library is used to parse the HTML.<br> | |
| * The base assumption is that loading of an external image must be allowed, for which, domains cannot be constrainted. | |
| * | |
| * @see "https://jsoup.org/" | |
| * @see "https://mvnrepository.com/artifact/org.jsoup/jsoup" | |
| */ | |
| public class HtmlToPdfSanitizer { | |
| private static boolean isValidURL(String url) { | |
| boolean stringIsAnURL = false; | |
| try { | |
| URI u = new URI(url); | |
| stringIsAnURL = (u.getScheme() != null && !u.getScheme().trim().isEmpty()); | |
| } catch (URISyntaxException e) { | |
| stringIsAnURL = false; | |
| } | |
| return stringIsAnURL; | |
| } | |
| private static boolean isValidImage(URI url) { | |
| //By default consider that is not a valid image | |
| boolean resourceIsImage = false; | |
| List<String> acceptedImageContentTypes = List.of("image/png", "image/jpeg", "image/gif"); | |
| //If the protocol is DATA then consider that it is an embedded image | |
| if ("data".equalsIgnoreCase(url.getScheme())) { | |
| resourceIsImage = true; | |
| } else if ("http".equalsIgnoreCase(url.getScheme()) || "https".equalsIgnoreCase(url.getScheme())) { | |
| //If the protocol is not HTTP or HTTPS then consider that it is not a valid image | |
| try (HttpClient client = HttpClient.newHttpClient()) { | |
| //Let 10 seconds to the service to reply and prevent disclosing of the java version | |
| HttpRequest request = HttpRequest.newBuilder() | |
| .uri(url) | |
| .timeout(Duration.ofSeconds(10)) | |
| .header("User-Agent", "ImageChecker") | |
| .GET() | |
| .build(); | |
| HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString()); | |
| Optional<String> contentType = response.headers().firstValue("Content-Type"); | |
| if (response.statusCode() == 200 && contentType.isPresent() && acceptedImageContentTypes.contains(contentType.get().toLowerCase(Locale.ROOT))) { | |
| resourceIsImage = true; | |
| } | |
| } catch (Exception e) { | |
| //If any error occur then assume that the target located is not a valid image | |
| resourceIsImage = false; | |
| } | |
| } | |
| return resourceIsImage; | |
| } | |
| public static String sanitizeHTML(String unsafeHTML) { | |
| final List<String> tagsToRemove = List.of("iframe", "frame", "object", "embed", | |
| "video", "audio", "source", "track", "script", "form", "link"); | |
| final String attributeOverrideValue = "danger.png"; | |
| final String attributeNameSelected = "src"; | |
| //Clean document from tags/attributes that we consider dangerous and not needed from our app context | |
| Safelist safelist = Safelist.relaxed(); | |
| //--Remove tags allowing to request non image content | |
| safelist.removeTags(tagsToRemove.toArray(String[]::new)); | |
| //--Only allow HTTP/HTTPS protocols and direct DATA for images | |
| //--Only allow HTTP/HTTPS protocols for hyperlinks | |
| safelist.addProtocols("img", "src", "http", "https", "data") | |
| .addProtocols("a", "href", "http", "https") | |
| .preserveRelativeLinks(true); | |
| String cleanedHTML = Jsoup.clean(unsafeHTML, "http://localhost/", safelist); | |
| Document cleanedDocument = Jsoup.parse(cleanedHTML); | |
| //Select all tags that having an attribute 'src' | |
| Elements elements = cleanedDocument.select(String.format("[%s]", attributeNameSelected)); | |
| //Inspect all locations to look for: | |
| // 1) Path traversal location => Disable path | |
| // 2) Location using // => Disable path | |
| // 3) UNC reference => Disable path | |
| // 4) URL => Check if target is a real valid image using the HTTP/HTTPS protocol | |
| elements.forEach(element -> { | |
| String attributeValue = element.attr(attributeNameSelected).trim(); | |
| //TODO: Handle multiple decoding rounds to prevent bypass | |
| //See method "applyURLDecoding()" on https://github.com/righettod/code-snippets-security-utils | |
| attributeValue = URLDecoder.decode(attributeValue, Charset.defaultCharset()).trim(); | |
| if (attributeValue.contains("..")) { | |
| //Path traversal => Set the attribute to empty target | |
| element.attr(attributeNameSelected, attributeOverrideValue); | |
| } else if (attributeValue.startsWith("//")) { | |
| //Url reference using "//" without the protocols => Set the attribute to empty target | |
| element.attr(attributeNameSelected, attributeOverrideValue); | |
| } else if (attributeValue.startsWith("\\\\")) { | |
| //UNC using "\\" without the protocols => Set the attribute to empty target | |
| element.attr(attributeNameSelected, attributeOverrideValue); | |
| } else if (isValidURL(attributeValue) && !isValidImage(URI.create(attributeValue))) { | |
| //Check URL => Set the attribute to empty target | |
| element.attr(attributeNameSelected, attributeOverrideValue); | |
| } | |
| }); | |
| //Return sanitized HTML | |
| return cleanedDocument.outerHtml(); | |
| } | |
| public static void main(String[] args) throws Exception { | |
| //Load the HTML content in a string | |
| String unsafeHTML = Files.readString(Paths.get("data-in.txt")); | |
| //Sanitize the HTML | |
| String sanitizedHTML = sanitizeHTML(unsafeHTML); | |
| //Save the updated HTML | |
| Path out = Paths.get("data-out.html"); | |
| Files.writeString(out, sanitizedHTML, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); | |
| } | |
| } |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note
The content of data-in.txt was generated by ChatGPT from the following user prompt:
Give me an sample html content that contains all dangerous tags and expressions when used in a export to PDF. It is to create a unit test of my sanitization method.π File
data-in.txt:π File
data-out.html: