righettod/HtmlToPdfSanitizer.java

## HtmlToPdfSanitizer.java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;

import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.time.Duration;
import java.util.List;
import java.util.Locale;
import java.util.Optional;

/**
 * Sample code trying to sanitize HTML content intended to be passed to an "HTML to PDF" exporter.<br>
 * The JSoup library is used to parse the HTML.<br>
 * The base assumption is that loading external images must be allowed and that the domains hosting
 * these images cannot be constrained.
 *
 * @see "https://jsoup.org/"
 * @see "https://mvnrepository.com/artifact/org.jsoup/jsoup"
 */
public class HtmlToPdfSanitizer {

    /** Media types accepted for an image, whether fetched over HTTP/HTTPS or embedded in a DATA URI. */
    private static final List<String> ACCEPTED_IMAGE_CONTENT_TYPES = List.of("image/png", "image/jpeg", "image/gif");

    /** Charset used for URL decoding; fixed so that the result does not depend on the platform configuration. */
    private static final Charset DECODING_CHARSET = java.nio.charset.StandardCharsets.UTF_8;

    /** Value written into a 'src' attribute whose original target was judged unsafe. */
    private static final String SRC_OVERRIDE_VALUE = "danger.png";

    /**
     * Tells whether a string parses as a URI that carries a non-empty scheme.
     *
     * @param url Candidate string.
     * @return {@code true} only if the string is a syntactically valid URI with a scheme.
     */
    private static boolean isValidURL(String url) {
        try {
            String scheme = new URI(url).getScheme();
            return scheme != null && !scheme.trim().isEmpty();
        } catch (URISyntaxException e) {
            //Not parseable as a URI at all
            return false;
        }
    }

    /**
     * Returns the part of a value located before the first occurrence of any delimiter character.
     *
     * @param value      String to cut.
     * @param delimiters Characters that each end the leading token.
     * @return The leading token, or the whole value when no delimiter is present.
     */
    private static String leadingToken(String value, String delimiters) {
        for (int i = 0; i < value.length(); i++) {
            if (delimiters.indexOf(value.charAt(i)) >= 0) {
                return value.substring(0, i);
            }
        }
        return value;
    }

    /**
     * Tells whether a media type (without parameters) is one of the accepted image types.
     *
     * @param mediaType Media type such as "image/png"; compared case-insensitively after trimming.
     * @return {@code true} if the media type is accepted.
     */
    private static boolean isAcceptedImageType(String mediaType) {
        return ACCEPTED_IMAGE_CONTENT_TYPES.contains(mediaType.trim().toLowerCase(Locale.ROOT));
    }

    /**
     * Tells whether a URL designates an image of an accepted type.
     * <ul>
     * <li>DATA URI: the declared media type must be an accepted image type.</li>
     * <li>HTTP/HTTPS URL: a GET request is sent and the response must be a 200 whose Content-Type
     * (parameters such as "; charset=..." ignored) is an accepted image type.</li>
     * <li>Any other scheme: never a valid image.</li>
     * </ul>
     *
     * @param url URL to check.
     * @return {@code true} only if the target was positively identified as an accepted image.
     */
    private static boolean isValidImage(URI url) {
        String scheme = url.getScheme();
        if ("data".equalsIgnoreCase(scheme)) {
            //For "data:image/png;base64,AAAA" the scheme specific part starts with the media type,
            //which ends at the first ';' (parameters) or ',' (payload)
            return isAcceptedImageType(leadingToken(url.getRawSchemeSpecificPart(), ";,"));
        }
        if (!"http".equalsIgnoreCase(scheme) && !"https".equalsIgnoreCase(scheme)) {
            //If the protocol is not HTTP or HTTPS then consider that it is not a valid image
            return false;
        }
        try (HttpClient client = HttpClient.newHttpClient()) {
            //Let 10 seconds to the service to reply and prevent disclosing of the java version
            HttpRequest request = HttpRequest.newBuilder()
                    .uri(url)
                    .timeout(Duration.ofSeconds(10))
                    .header("User-Agent", "ImageChecker")
                    .GET()
                    .build();
            //Only the status and the headers are inspected so the body is discarded instead of being buffered
            HttpResponse<Void> response = client.send(request, HttpResponse.BodyHandlers.discarding());
            return response.statusCode() == 200
                    && response.headers().firstValue("Content-Type")
                    .map(contentType -> leadingToken(contentType, ";"))
                    .filter(HtmlToPdfSanitizer::isAcceptedImageType)
                    .isPresent();
        } catch (InterruptedException e) {
            //Keep the interruption visible to the caller and consider the target as not verified
            Thread.currentThread().interrupt();
            return false;
        } catch (Exception e) {
            //If any error occur then assume that the target located is not a valid image
            return false;
        }
    }

    /**
     * Decides whether the value of a 'src' attribute must be neutralized.
     * A value is unsafe when:
     * <ol>
     * <li>it cannot be URL decoded (malformed '%' escape),</li>
     * <li>once decoded it contains ".." (path traversal),</li>
     * <li>once decoded it starts with "//" (protocol relative URL) or "\\" (UNC reference),</li>
     * <li>it is a URL, in its raw or in its decoded form, that is not a valid image.</li>
     * </ol>
     *
     * @param rawValue Trimmed attribute value as found in the document.
     * @return {@code true} if the attribute value must be replaced.
     */
    private static boolean isUnsafeSource(String rawValue) {
        String decodedValue;
        try {
            //TODO: Handle multiple decoding rounds to prevent bypass
            //See method "applyURLDecoding()" on https://github.com/righettod/code-snippets-security-utils
            decodedValue = URLDecoder.decode(rawValue, DECODING_CHARSET).trim();
        } catch (IllegalArgumentException e) {
            //Malformed escape like "%zz": the value cannot be inspected so it is rejected instead of failing the whole sanitization
            return true;
        }
        if (decodedValue.contains("..") || decodedValue.startsWith("//") || decodedValue.startsWith("\\\\")) {
            return true;
        }
        //The raw value is what ends up in the output so it is the one to verify first. Checking only the
        //decoded form is not enough: decoding turns '+' and "%20" into spaces, which makes the URI
        //unparseable and would silently skip the image verification
        if (isValidURL(rawValue) && !isValidImage(URI.create(rawValue))) {
            return true;
        }
        //Also verify the decoded form when it differs, so an encoded scheme is still caught
        return !decodedValue.equals(rawValue)
                && isValidURL(decodedValue)
                && !isValidImage(URI.create(decodedValue));
    }

    /**
     * Sanitizes an HTML content: the content is cleaned with a JSoup safelist, then every remaining
     * 'src' attribute whose target is judged unsafe is replaced by "danger.png".<br>
     * Checking an HTTP/HTTPS target sends a GET request to that target.
     *
     * @param unsafeHTML HTML content to sanitize.
     * @return The serialized HTML document obtained after cleaning and 'src' attributes inspection.
     */
    public static String sanitizeHTML(String unsafeHTML) {
        final List<String> tagsToRemove = List.of("iframe", "frame", "object", "embed",
                "video", "audio", "source", "track", "script", "form", "link");
        final String attributeNameSelected = "src";
        //Clean document from tags/attributes that we consider dangerous and not needed from our app context
        Safelist safelist = Safelist.relaxed();
        //--Remove tags allowing to request non image content
        safelist.removeTags(tagsToRemove.toArray(String[]::new));
        //--Only allow HTTP/HTTPS protocols and direct DATA for images
        //--Only allow HTTP/HTTPS protocols for hyperlinks
        safelist.addProtocols("img", "src", "http", "https", "data")
                .addProtocols("a", "href", "http", "https")
                .preserveRelativeLinks(true);
        String cleanedHTML = Jsoup.clean(unsafeHTML, "http://localhost/", safelist);
        Document cleanedDocument = Jsoup.parse(cleanedHTML);
        //Select all tags having an attribute 'src'
        Elements elements = cleanedDocument.select(String.format("[%s]", attributeNameSelected));
        //Replace every location that is a path traversal, a "//" or UNC reference,
        //an undecodable value or a URL that is not a verified image
        elements.forEach(element -> {
            String attributeValue = element.attr(attributeNameSelected).trim();
            if (isUnsafeSource(attributeValue)) {
                element.attr(attributeNameSelected, SRC_OVERRIDE_VALUE);
            }
        });
        //Return sanitized HTML
        return cleanedDocument.outerHtml();
    }

    /**
     * Reads the HTML from the file "data-in.txt", sanitizes it and writes the result to the file "data-out.html".
     *
     * @param args Not used.
     * @throws Exception If the input file cannot be read or the output file cannot be written.
     */
    public static void main(String[] args) throws Exception {
        //Load the HTML content in a string
        String unsafeHTML = Files.readString(Paths.get("data-in.txt"));
        //Sanitize the HTML
        String sanitizedHTML = sanitizeHTML(unsafeHTML);
        //Save the updated HTML
        Path out = Paths.get("data-out.html");
        Files.writeString(out, sanitizedHTML, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
    }
}
	import org.jsoup.Jsoup;
	import org.jsoup.nodes.Document;
	import org.jsoup.safety.Safelist;
	import org.jsoup.select.Elements;

	import java.net.URI;
	import java.net.URISyntaxException;
	import java.net.URLDecoder;
	import java.net.http.HttpClient;
	import java.net.http.HttpRequest;
	import java.net.http.HttpResponse;
	import java.nio.charset.Charset;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import java.nio.file.Paths;
	import java.nio.file.StandardOpenOption;
	import java.time.Duration;
	import java.util.List;
	import java.util.Locale;
	import java.util.Optional;

	/**
	* Sample code trying to sanitize an HTML content intended to be passed to an "HTML to PDF" exporter.<br>
	* JSoup library is used to parse the HTML.<br>
	* The base assumption is that loading of an external image must be allowed, for which, domains cannot be constrainted.
	*
	* @see "https://jsoup.org/"
	* @see "https://mvnrepository.com/artifact/org.jsoup/jsoup"
	*/
	public class HtmlToPdfSanitizer {

	private static boolean isValidURL(String url) {
	boolean stringIsAnURL = false;
	try {
	URI u = new URI(url);
	stringIsAnURL = (u.getScheme() != null && !u.getScheme().trim().isEmpty());
	} catch (URISyntaxException e) {
	stringIsAnURL = false;
	}
	return stringIsAnURL;
	}

	private static boolean isValidImage(URI url) {
	//By default consider that is not a valid image
	boolean resourceIsImage = false;
	List<String> acceptedImageContentTypes = List.of("image/png", "image/jpeg", "image/gif");
	//If the protocol is DATA then consider that it is an embedded image
	if ("data".equalsIgnoreCase(url.getScheme())) {
	resourceIsImage = true;
	} else if ("http".equalsIgnoreCase(url.getScheme()) \|\| "https".equalsIgnoreCase(url.getScheme())) {
	//If the protocol is not HTTP or HTTPS then consider that it is not a valid image
	try (HttpClient client = HttpClient.newHttpClient()) {
	//Let 10 seconds to the service to reply and prevent disclosing of the java version
	HttpRequest request = HttpRequest.newBuilder()
	.uri(url)
	.timeout(Duration.ofSeconds(10))
	.header("User-Agent", "ImageChecker")
	.GET()
	.build();
	HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
	Optional<String> contentType = response.headers().firstValue("Content-Type");
	if (response.statusCode() == 200 && contentType.isPresent() && acceptedImageContentTypes.contains(contentType.get().toLowerCase(Locale.ROOT))) {
	resourceIsImage = true;
	}
	} catch (Exception e) {
	//If any error occur then assume that the target located is not a valid image
	resourceIsImage = false;
	}
	}
	return resourceIsImage;
	}

	public static String sanitizeHTML(String unsafeHTML) {
	final List<String> tagsToRemove = List.of("iframe", "frame", "object", "embed",
	"video", "audio", "source", "track", "script", "form", "link");
	final String attributeOverrideValue = "danger.png";
	final String attributeNameSelected = "src";
	//Clean document from tags/attributes that we consider dangerous and not needed from our app context
	Safelist safelist = Safelist.relaxed();
	//--Remove tags allowing to request non image content
	safelist.removeTags(tagsToRemove.toArray(String[]::new));
	//--Only allow HTTP/HTTPS protocols and direct DATA for images
	//--Only allow HTTP/HTTPS protocols for hyperlinks
	safelist.addProtocols("img", "src", "http", "https", "data")
	.addProtocols("a", "href", "http", "https")
	.preserveRelativeLinks(true);
	String cleanedHTML = Jsoup.clean(unsafeHTML, "http://localhost/", safelist);
	Document cleanedDocument = Jsoup.parse(cleanedHTML);
	//Select all tags that having an attribute 'src'
	Elements elements = cleanedDocument.select(String.format("[%s]", attributeNameSelected));
	//Inspect all locations to look for:
	// 1) Path traversal location => Disable path
	// 2) Location using // => Disable path
	// 3) UNC reference => Disable path
	// 4) URL => Check if target is a real valid image using the HTTP/HTTPS protocol
	elements.forEach(element -> {
	String attributeValue = element.attr(attributeNameSelected).trim();
	//TODO: Handle multiple decoding rounds to prevent bypass
	//See method "applyURLDecoding()" on https://github.com/righettod/code-snippets-security-utils
	attributeValue = URLDecoder.decode(attributeValue, Charset.defaultCharset()).trim();
	if (attributeValue.contains("..")) {
	//Path traversal => Set the attribute to empty target
	element.attr(attributeNameSelected, attributeOverrideValue);
	} else if (attributeValue.startsWith("//")) {
	//Url reference using "//" without the protocols => Set the attribute to empty target
	element.attr(attributeNameSelected, attributeOverrideValue);
	} else if (attributeValue.startsWith("\\\\")) {
	//UNC using "\\" without the protocols => Set the attribute to empty target
	element.attr(attributeNameSelected, attributeOverrideValue);
	} else if (isValidURL(attributeValue) && !isValidImage(URI.create(attributeValue))) {
	//Check URL => Set the attribute to empty target
	element.attr(attributeNameSelected, attributeOverrideValue);
	}
	});
	//Return sanitized HTML
	return cleanedDocument.outerHtml();
	}

	public static void main(String[] args) throws Exception {
	//Load the HTML content in a string
	String unsafeHTML = Files.readString(Paths.get("data-in.txt"));
	//Sanitize the HTML
	String sanitizedHTML = sanitizeHTML(unsafeHTML);
	//Save the updated HTML
	Path out = Paths.get("data-out.html");
	Files.writeString(out, sanitizedHTML, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
	}
	}
No results found