Skip to content

Instantly share code, notes, and snippets.

@righettod
Last active January 23, 2026 07:10
Show Gist options
  • Select an option

  • Save righettod/c9d363f6ca0df348dfd47370972db870 to your computer and use it in GitHub Desktop.

Select an option

Save righettod/c9d363f6ca0df348dfd47370972db870 to your computer and use it in GitHub Desktop.
Sample code that attempts to sanitize HTML content intended to be passed to an "HTML to PDF" exporter.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.time.Duration;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
/**
* Sample code that attempts to sanitize HTML content intended to be passed to an "HTML to PDF" exporter.<br>
* The jsoup library is used to parse the HTML.<br>
* The base assumption is that loading external images must be allowed and that the domains they are loaded from cannot be constrained.
*
* @see "https://jsoup.org/"
* @see "https://mvnrepository.com/artifact/org.jsoup/jsoup"
*/
public class HtmlToPdfSanitizer {

    /** Media types accepted as images, for remote resources as well as embedded {@code data:} URIs. */
    private static final List<String> ACCEPTED_IMAGE_CONTENT_TYPES = List.of("image/png", "image/jpeg", "image/gif");

    /** Tags removed from the relaxed safelist because they allow requesting non image content. */
    private static final List<String> TAGS_TO_REMOVE = List.of("iframe", "frame", "object", "embed",
            "video", "audio", "source", "track", "script", "form", "link");

    /** Value written into the inspected attribute when its original location is rejected. */
    private static final String ATTRIBUTE_OVERRIDE_VALUE = "danger.png";

    /** Name of the attribute whose locations are inspected. */
    private static final String ATTRIBUTE_NAME_SELECTED = "src";

    /** Matches a leading URI scheme ("http:", "data:", ...) as defined by RFC 3986. */
    private static final java.util.regex.Pattern SCHEME_PREFIX =
            java.util.regex.Pattern.compile("^[A-Za-z][A-Za-z0-9+.\\-]*:");

    /**
     * Tells whether a string can be parsed as a URI that carries a scheme.
     *
     * @param url the string to test
     * @return {@code true} when the string is a syntactically valid URI with a non blank scheme
     */
    private static boolean isValidURL(String url) {
        try {
            String scheme = new URI(url).getScheme();
            return scheme != null && !scheme.trim().isEmpty();
        } catch (URISyntaxException e) {
            return false;
        }
    }

    /**
     * Extracts the bare media type from a {@code Content-Type} header value or from the
     * scheme specific part of a {@code data:} URI, i.e. everything before the first
     * {@code ';'} (parameters) or {@code ','} (start of the data), trimmed and lower-cased.
     *
     * @param value header value or data URI payload
     * @return the lower-cased media type, possibly empty
     */
    private static String extractMediaType(String value) {
        int end = value.length();
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (c == ';' || c == ',') {
                end = i;
                break;
            }
        }
        return value.substring(0, end).trim().toLowerCase(Locale.ROOT);
    }

    /**
     * Tells whether the given location refers to an accepted image.
     * <ul>
     * <li>{@code data:} URI: the declared media type must be one of the accepted image types.</li>
     * <li>{@code http}/{@code https} URI: the resource is requested and must answer with status 200
     * and an accepted image {@code Content-Type} (parameters such as {@code charset} are ignored).</li>
     * <li>Any other scheme: not an image.</li>
     * </ul>
     *
     * @param url the location to verify
     * @return {@code true} only when the location was positively identified as an accepted image
     */
    private static boolean isValidImage(URI url) {
        String scheme = url.getScheme();
        if ("data".equalsIgnoreCase(scheme)) {
            //Embedded content: trust it only when it declares an accepted image media type
            //The raw form is used so that an encoded media type cannot be normalized into an accepted one
            String payload = url.getRawSchemeSpecificPart();
            return payload != null && ACCEPTED_IMAGE_CONTENT_TYPES.contains(extractMediaType(payload));
        }
        if (!"http".equalsIgnoreCase(scheme) && !"https".equalsIgnoreCase(scheme)) {
            //If the protocol is not HTTP or HTTPS then consider that it is not a valid image
            return false;
        }
        //NOTE(review): this issues a server-side request to a location chosen by the author of the HTML,
        //so it can be used to reach internal hosts (SSRF). Confirm that such targets are filtered elsewhere.
        try (HttpClient client = HttpClient.newHttpClient()) {
            //Let 10 seconds to the service to reply and prevent disclosing of the java version
            HttpRequest request = HttpRequest.newBuilder()
                    .uri(url)
                    .timeout(Duration.ofSeconds(10))
                    .header("User-Agent", "ImageChecker")
                    .GET()
                    .build();
            //Only the status and the headers are needed: the body is exposed as a stream and closed
            //unread, so that an arbitrarily large response is never buffered in memory
            HttpResponse<java.io.InputStream> response = client.send(request, HttpResponse.BodyHandlers.ofInputStream());
            try (java.io.InputStream ignoredBody = response.body()) {
                Optional<String> contentType = response.headers().firstValue("Content-Type");
                return response.statusCode() == 200
                        && contentType.map(HtmlToPdfSanitizer::extractMediaType)
                        .filter(ACCEPTED_IMAGE_CONTENT_TYPES::contains)
                        .isPresent();
            }
        } catch (InterruptedException e) {
            //Keep the interruption visible to the caller and assume that the target is not a valid image
            Thread.currentThread().interrupt();
            return false;
        } catch (Exception e) {
            //If any error occur then assume that the target located is not a valid image
            return false;
        }
    }

    /**
     * Decides whether the location found in an attribute must be disabled. A location is rejected when:
     * <ol>
     * <li>it cannot be URL decoded (malformed percent-encoding),</li>
     * <li>its decoded form contains a path traversal ({@code ..}),</li>
     * <li>its decoded form starts with {@code //} (protocol relative URL) or {@code \\} (UNC reference),</li>
     * <li>a scheme only appears once the value is decoded (obfuscated absolute URL),</li>
     * <li>it carries a scheme but is not a parsable URI, or is not a valid image.</li>
     * </ol>
     * Relative locations passing checks 1 to 3 are kept.
     *
     * @param attributeValue the attribute value as found in the document
     * @return {@code true} when the location must be replaced
     */
    private static boolean isUnsafeSource(String attributeValue) {
        String rawValue = attributeValue.trim();
        String decodedValue;
        try {
            //TODO: Handle multiple decoding rounds to prevent bypass
            //See method "applyURLDecoding()" on https://github.com/righettod/code-snippets-security-utils
            decodedValue = URLDecoder.decode(rawValue, java.nio.charset.StandardCharsets.UTF_8).trim();
        } catch (IllegalArgumentException e) {
            //Malformed percent-encoding: the real target cannot be determined, so reject it
            return true;
        }
        if (decodedValue.contains("..") || decodedValue.startsWith("//") || decodedValue.startsWith("\\\\")) {
            return true;
        }
        boolean rawHasScheme = SCHEME_PREFIX.matcher(rawValue).find();
        boolean decodedHasScheme = SCHEME_PREFIX.matcher(decodedValue).find();
        if (!rawHasScheme && !decodedHasScheme) {
            //Relative location
            return false;
        }
        if (!rawHasScheme) {
            //The scheme was hidden behind percent-encoding
            return true;
        }
        //The raw value is verified (not the decoded one): decoding turns '+' and "%20" into spaces,
        //which makes legitimate locations unparsable and would let them skip the verification
        return !isValidURL(rawValue) || !isValidImage(URI.create(rawValue));
    }

    /**
     * Sanitizes an HTML content before it is passed to an "HTML to PDF" exporter.
     * <p>
     * The content is first cleaned with a relaxed jsoup safelist from which the tags allowing to request
     * non image content are removed, then every remaining {@code src} attribute is inspected and replaced
     * by {@code danger.png} when its location is rejected.
     *
     * @param unsafeHTML the HTML content to sanitize
     * @return the sanitized HTML document
     */
    public static String sanitizeHTML(String unsafeHTML) {
        //Clean document from tags/attributes that we consider dangerous and not needed from our app context
        Safelist safelist = Safelist.relaxed();
        //--Remove tags allowing to request non image content
        safelist.removeTags(TAGS_TO_REMOVE.toArray(String[]::new));
        //--Only allow HTTP/HTTPS protocols and direct DATA for images
        //--Only allow HTTP/HTTPS protocols for hyperlinks
        safelist.addProtocols("img", "src", "http", "https", "data")
                .addProtocols("a", "href", "http", "https")
                .preserveRelativeLinks(true);
        String cleanedHTML = Jsoup.clean(unsafeHTML, "http://localhost/", safelist);
        Document cleanedDocument = Jsoup.parse(cleanedHTML);
        //Select all tags that having an attribute 'src'
        Elements elements = cleanedDocument.select(String.format("[%s]", ATTRIBUTE_NAME_SELECTED));
        //Disable every location that is rejected
        elements.forEach(element -> {
            if (isUnsafeSource(element.attr(ATTRIBUTE_NAME_SELECTED))) {
                element.attr(ATTRIBUTE_NAME_SELECTED, ATTRIBUTE_OVERRIDE_VALUE);
            }
        });
        //Return sanitized HTML
        return cleanedDocument.outerHtml();
    }

    /**
     * Reads {@code data-in.txt}, sanitizes its content and writes the result to {@code data-out.html}.
     *
     * @param args unused
     * @throws java.io.IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws java.io.IOException {
        //Load the HTML content in a string
        String unsafeHTML = Files.readString(Paths.get("data-in.txt"));
        //Sanitize the HTML
        String sanitizedHTML = sanitizeHTML(unsafeHTML);
        //Save the updated HTML
        Path out = Paths.get("data-out.html");
        Files.writeString(out, sanitizedHTML, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
    }
}
@righettod
Copy link
Author

righettod commented Jan 23, 2026

Note

data-in.txt content was generated by ChatGPT via this user prompt Give me an sample html content that contains all dangerous tags and expressions when used in a export to PDF. It is to create a unit test of my sanitization method.

🐞 File data-in.txt:

<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">

    <!-- ❌ External & local CSS -->
    <link rel="stylesheet" href="http://evil.example.com/style.css">
    <link rel="stylesheet" href="file:///etc/passwd">

    <!-- ❌ CSS file access via url() -->
    <style>
        body {
            background-image: url("file:///etc/shadow");
        }
    </style>

    <!-- ❌ JavaScript -->
    <script src="http://evil.example.com/evil.js"></script>
    <script>alert('XSS');</script>
</head>

<body>

<h1>HTML → PDF Sanitization Test</h1>

<!-- ✅ VALID local image (should be kept) -->
<img src="images/logo.png" alt="Valid local image">

<!-- ❌ IFRAME -->
<iframe src="http://evil.example.com/page.html"></iframe>
<iframe src="file:///etc/passwd"></iframe>

<!-- ❌ OBJECT / EMBED -->
<object data="file:///etc/passwd" type="text/plain"></object>
<embed src="http://evil.example.com/evil.swf">

<!-- ❌ IMAGE abuse -->
<img src="file:///etc/passwd">
<img src="../etc/passwd">
<img src="jar:file:/app.jar!/secret.png">

<!-- ✅ Base64 image (allowed if data: enabled) -->
<img
  src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+X2LsAAAAASUVORK5CYII="
  alt="Base64 image"
/>

<!-- ❌ SVG external reference -->
<svg width="100" height="100">
    <use href="file:///etc/passwd"></use>
</svg>

<!-- ❌ Links -->
<a href="http://evil.example.com">External link</a>
<a href="file:///etc/passwd">Local file link</a>
<a href="javascript:alert('XSS')">JS link</a>

<!-- ❌ Form -->
<form action="file:///etc/passwd" method="get">
    <input type="submit" value="Submit">
</form>

<!-- ❌ Meta refresh -->
<meta http-equiv="refresh" content="0;url=file:///etc/passwd">

<!-- ❌ Base tag -->
<base href="file:///">

<!-- ❌ Event handlers -->
<img src="images/ok.png" onerror="alert('XSS')">
<div onclick="alert('XSS')">Click me</div>

<!-- ❌ Hidden tricks -->
<!-- <img src="file:///etc/passwd"> -->
<div data-url="file:///etc/passwd"></div>

</body>
</html>

πŸ“ File data-out.html:

<html>
 <head></head>
 <body>
  <h1>HTML → PDF Sanitization Test</h1>
  <img src="images/logo.png" alt="Valid local image"> <img> <img src="danger.png"> <img> <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+X2LsAAAAASUVORK5CYII=" alt="Base64 image"> <a href="http://evil.example.com">External link</a> <a>Local file link</a> <a>JS link</a> <img src="images/ok.png">
  <div>Click me</div>
  <div></div>
 </body>
</html>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment