Last active
October 7, 2021 19:00
-
-
Save punit-naik/f4ee8761013fd42bbdd5d26ba47b5afd to your computer and use it in GitHub Desktop.
Provides functions to extract content from a doc using Tika parsers in Clojure
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ;; NOTE: Please add [org.apache.tika/tika-parsers "1.25"] and [org.apache.tika/tika-core "1.25"] to your dependencies | |
| (import '[org.apache.tika.parser AutoDetectParser]) | |
| (import '[org.apache.tika.metadata Metadata]) | |
| (import '[org.apache.tika.sax BodyContentHandler]) | |
| (import '[java.nio.file Files]) | |
| (import '[java.io ByteArrayInputStream ByteArrayOutputStream]) | |
| (require '[clojure.java.io :as io]) | |
(defn parse-doc
  "Extracts the body text of a document with Tika's AutoDetectParser.

  Arguments:
    file-byte-array - the raw bytes of the document (a Java byte array).

  Returns the extracted body text as a String.

  Exceptions thrown while parsing propagate to the caller; the input
  stream is closed whether or not parsing succeeds."
  [file-byte-array]
  ;; Characters are collected in a StringWriter instead of an OutputStream.
  ;; BodyContentHandler's OutputStream constructor encodes with the platform
  ;; default charset, so decoding those bytes as UTF-8 afterwards garbles
  ;; non-ASCII text on any JVM whose default charset is not UTF-8.
  ;; The Writer constructor, like the OutputStream one, imposes no write limit.
  (let [text-sink (java.io.StringWriter.)]
    (with-open [input (ByteArrayInputStream. file-byte-array)]
      (.parse (AutoDetectParser.)
              input
              (BodyContentHandler. text-sink)
              (Metadata.)))
    (.toString text-sink)))
;; Example usage: point `file` at the document to extract text from.
(def file
  (io/file "path/to/file"))

;; Read the whole document into memory as a byte array.
(def file-bytes
  (-> file
      .toPath
      Files/readAllBytes))

;; Extracted body text of the document.
(def parsed-file
  (parse-doc file-bytes))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment