Skip to content

Instantly share code, notes, and snippets.

@paulvictor
Last active February 23, 2026 06:43
Show Gist options
  • Select an option

  • Save paulvictor/f45a696d6ad3153140eaf3142539a1dc to your computer and use it in GitHub Desktop.

Select an option

Save paulvictor/f45a696d6ad3153140eaf3142539a1dc to your computer and use it in GitHub Desktop.
faster-dedup.hs
#!/usr/bin/env magix
#!magix haskell
#!haskellPackages bytestring vector conduit conduit-extra
#!ghcFlags -O2
-- #!/usr/bin/env nix-shell
-- #! nix-shell -I nixpkgs=/etc/nix/inputs/nixpkgs
-- #! nix-shell -p "haskell.packages.ghc910.ghcWithPackages (pkgs: with pkgs; [ conduit conduit-extra ]) " ghcid
-- #! nix-shell -i ghcid
{-# LANGUAGE OverloadedStrings #-}
import Data.ByteString qualified as BS
import Data.ByteString (ByteString)
import qualified Data.Conduit.List as C
import qualified Data.Conduit.Combinators as C
import Data.Conduit
import Data.Conduit.Zlib
import Conduit
import Debug.Trace
-- | The byte windows to extract, as @(offset, length)@ pairs: 520001
-- back-to-back windows of 100 bytes each, starting at offset 0.
offsetAndLimit :: [(Int, Int)]
offsetAndLimit = [(window * 100, 100) | window <- [0 .. 520000]]
-- | Stream @./etl.gz@ through gzip decompression, keep only the byte windows
-- listed in 'offsetAndLimit', and write the surviving bytes to stdout.
main :: IO ()
main = runConduitRes pipeline
  where
    -- The offset/length windows translated into a drop/take script.
    script = limitOffsetToActions offsetAndLimit
    pipeline =
      C.sourceFile "./etl.gz"
        .| ungzip
        .| selectedBytes script
        .| C.stdout
-- | One step of the byte-selection script interpreted by 'selectedBytes'.
data Action
  = Take Int -- ^ Emit the next @n@ bytes downstream.
  | Drop Int -- ^ Discard the next @n@ bytes.
  deriving Show
-- | Turn a list of @(offset, length)@ windows into a flat drop/take script.
--
-- Each window contributes a 'Drop' covering the gap between the end of the
-- previous window (initially position 0) and its own offset, followed by a
-- 'Take' of its length. Windows are used in the order given; the gap is
-- computed by plain subtraction, so an overlapping or out-of-order window
-- produces a negative 'Drop'.
limitOffsetToActions :: [(Int, Int)] -> [Action]
limitOffsetToActions = go 0
  where
    -- @previousEnd@ is the stream position just past the preceding window.
    go _ [] = []
    go previousEnd ((offset, len) : rest) =
      Drop (offset - previousEnd) : Take len : go (offset + len) rest
-- | Pass through only the bytes selected by a script of 'Action's.
--
-- The incoming stream is treated as one continuous byte sequence, however it
-- happens to be split into chunks. Actions are consumed in order: @Drop n@
-- discards the next @n@ bytes and @Take n@ yields the next @n@ bytes
-- downstream. The conduit finishes as soon as the script is exhausted or
-- upstream runs dry, whichever comes first; any bytes left in the current
-- chunk after the final action are discarded.
selectedBytes :: Monad m => [Action] -> ConduitT ByteString ByteString m ()
selectedBytes = fetchAndProcessBytes
  where
    -- Pull exactly one chunk and resume the script on it. This deliberately
    -- uses a single 'await' instead of 'awaitForever': the latter loops back
    -- once the script has finished and replays stale actions against every
    -- remaining chunk, emitting bytes that were never requested.
    fetchAndProcessBytes [] = return () -- Script finished: stop consuming.
    fetchAndProcessBytes actions =
      await >>= maybe (return ()) (\bs -> processBytes bs actions)

    -- Apply as many actions as the current chunk can satisfy, then fetch more.
    processBytes _ [] = return () -- No more actions
    processBytes bs (Drop i : rest) =
      case compare (BS.length bs) i of
        -- The chunk extends past the dropped region: keep going on the rest.
        GT -> processBytes (BS.drop i bs) rest
        -- The chunk is consumed exactly: next action starts on a fresh chunk.
        EQ -> fetchAndProcessBytes rest
        -- Not enough dropped yet: carry the remainder over to the next chunk.
        LT -> fetchAndProcessBytes (Drop (i - BS.length bs) : rest)
    processBytes bs (Take i : rest) =
      case compare (BS.length bs) i of
        -- The chunk extends past the wanted region: emit the prefix only.
        GT -> do
          let (wanted, surplus) = BS.splitAt i bs
          yield wanted
          processBytes surplus rest
        -- The whole chunk is wanted and completes this action.
        EQ -> yield bs >> fetchAndProcessBytes rest
        -- The whole chunk is wanted but more is still owed for this action.
        LT -> yield bs >> fetchAndProcessBytes (Take (i - BS.length bs) : rest)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment