Last active
February 23, 2026 06:43
-
-
Save paulvictor/f45a696d6ad3153140eaf3142539a1dc to your computer and use it in GitHub Desktop.
faster-dedup.hs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env magix | |
| #!magix haskell | |
| #!haskellPackages bytestring vector conduit conduit-extra | |
| #!ghcFlags -O2 | |
| -- #!/usr/bin/env nix-shell | |
| -- #! nix-shell -I nixpkgs=/etc/nix/inputs/nixpkgs | |
| -- #! nix-shell -p "haskell.packages.ghc910.ghcWithPackages (pkgs: with pkgs; [ conduit conduit-extra ]) " ghcid | |
| -- #! nix-shell -i ghcid | |
| {-# LANGUAGE OverloadedStrings #-} | |
| import Data.ByteString qualified as BS | |
| import Data.ByteString (ByteString) | |
| import qualified Data.Conduit.List as C | |
| import qualified Data.Conduit.Combinators as C | |
| import Data.Conduit | |
| import Data.Conduit.Zlib | |
| import Conduit | |
| import Debug.Trace | |
-- | The byte ranges to keep, as @(offset, limit)@ pairs: one 100-byte
-- window starting at every multiple of 100, for indices 0 through 520000.
offsetAndLimit :: [(Int, Int)]
offsetAndLimit = [(windowIndex * 100, 100) | windowIndex <- [0 .. 520000]]
-- | Stream @./etl.gz@ through gzip decompression, keep only the byte ranges
-- listed in 'offsetAndLimit', and write the surviving bytes to stdout.
main :: IO ()
main = runConduitRes pipeline
  where
    -- The (offset, limit) ranges turned into a sequential drop/take script.
    actions = limitOffsetToActions offsetAndLimit
    pipeline =
      C.sourceFile "./etl.gz"
        .| ungzip
        .| selectedBytes actions
        .| C.stdout
-- | One step of the sequential byte-selection script interpreted by
-- 'selectedBytes'.
data Action
  = Take Int -- ^ Pass the next @n@ bytes of the stream downstream.
  | Drop Int -- ^ Discard the next @n@ bytes of the stream.
  deriving Show
-- | Turn a list of @(offset, limit)@ ranges into a sequential script of
-- 'Drop' / 'Take' steps.
--
-- For every range the script first drops the gap between the end of the
-- previous range (0 before the first range) and this range's offset, then
-- takes @limit@ bytes. The gap is computed by plain subtraction, so ranges
-- that overlap or are out of order produce a negative 'Drop' count.
limitOffsetToActions :: [(Int, Int)] -> [Action]
limitOffsetToActions ranges =
  concatMap gapThenTake (zip previousEnds ranges)
  where
    -- End position of the range preceding each range; the stream starts at 0.
    previousEnds = 0 : [offset + limit | (offset, limit) <- ranges]
    gapThenTake (previousEnd, (offset, limit)) =
      [Drop (offset - previousEnd), Take limit]
-- | Interpret a script of 'Take' / 'Drop' actions against a stream of
-- 'ByteString' chunks, yielding only the bytes covered by 'Take' actions.
--
-- Actions are consumed strictly in order and may span chunk boundaries: when
-- a chunk runs out before an action is satisfied, the remaining count is
-- carried over to the next chunk. The conduit finishes as soon as the script
-- is exhausted or upstream ends, whichever comes first; any bytes left in the
-- current chunk after the last action are discarded.
--
-- The loop is written with a single explicit 'await' that threads the pending
-- actions through every step. Re-entering 'awaitForever' for each chunk would
-- leave outer loops holding an outdated action list, which they would replay
-- on later chunks once the script finished mid-chunk.
selectedBytes :: MonadResource m => [Action] -> ConduitT ByteString ByteString m ()
selectedBytes = fetch
  where
    -- Pull the next chunk (if any) and continue with the pending actions.
    fetch [] = return () -- Script finished: stop consuming upstream.
    fetch pending = do
      next <- await
      case next of
        Nothing -> return () -- Upstream is exhausted.
        Just chunk -> process chunk pending

    -- Apply as many pending actions as the current chunk can satisfy.
    process _ [] = return () -- No more actions.
    process chunk (Drop n : rest)
      -- The chunk extends past the dropped region: keep working on the tail.
      | len > n = process (BS.drop n chunk) rest
      -- The chunk ends exactly where the dropped region ends.
      | len == n = fetch rest
      -- Not enough bytes dropped yet: carry the remainder to the next chunk.
      | otherwise = fetch (Drop (n - len) : rest)
      where
        len = BS.length chunk
    process chunk (Take n : rest)
      -- The chunk extends past the wanted region: emit the prefix, keep going.
      | len > n = do
          let (wanted, remainder) = BS.splitAt n chunk
          yield wanted
          process remainder rest
      -- The whole chunk is wanted; carry over whatever is still owed.
      | otherwise = do
          yield chunk
          fetch (if len == n then rest else Take (n - len) : rest)
      where
        len = BS.length chunk
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment