Software Tools in Haskell: getlines

extract lines from stdin by index

Posted on 2016-02-29 by nbloomf
Tags: software-tools-in-haskell, literate-haskell

This page is part of a series on Software Tools in Haskell.

This post is literate Haskell; you can load the source into GHCi and play along.


As usual, we start with some imports.

-- getlines: extract lines from stdin by index
module Main where

import System.Exit (exitSuccess, exitFailure)
import System.Environment (getArgs, getProgName)
import Data.List (isPrefixOf, unfoldr, intercalate)
import System.IO (hPutStrLn, stderr)

This program is not an example from Software Tools; I wrote it to test some functionality that will eventually go into the print program – namely, parsing sets of integers.

getlines does one thing: it takes a set of integers as an argument, and extracts from stdin the lines whose indices (counting from 1) are in the given set. For instance,

getlines "6"

extracts the line at index 6. We can also specify ranges, like

getlines "1-5"

which extracts lines 1, 2, 3, 4, and 5, as well as skip counts, like

getlines "2+3"

which extracts every third line starting with the second (i.e. 2, 5, 8, and so on). We can give several rules separated by commas, and the indices specified will be extracted in order. So

getlines "7-9,1,2"

will extract lines 1, 2, 7, 8, and 9, in that order. We can give more than one integer set argument, and each will be considered in turn with the results concatenated. So

getlines "1,2" "1,2" "1,2"

extracts lines 1, 2, 1, 2, 1, and 2, in that order.

We define a data type for each kind of integer set: single integers, ranges, and skip counts.

data IntSet
  = Single Int
  | Range  Int Int
  | Skip   Int Int
  deriving (Show)

inIntSet :: Int -> [IntSet] -> Bool
inIntSet k ms = or $ map (inIntSet' k) ms
  where
    inIntSet' :: Int -> IntSet -> Bool
    inIntSet' k (Single m)  = k == m
    inIntSet' k (Range a b) = (a <= k) && (k <= b)
    inIntSet' k (Skip a b)  = (k >= a) && ((k-a)`rem`b == 0)

readIntSet :: String -> Maybe (Int -> Bool)
readIntSet xs = do
  cs <- readIntSet' xs
  return (\k -> inIntSet k cs)
  where
    readIntSet' :: String -> Maybe [IntSet]
    readIntSet' = sequence . map oneIntSeq . breakAt ','
      where
        oneIntSeq :: String -> Maybe IntSet
        oneIntSeq "" = Nothing
        oneIntSeq xs = case readDecimalNat xs of
          Just k  -> Just $ Single k
          Nothing -> case map readDecimalNat $ breakAt '-' xs of
            [Just a, Just b] -> Just $ Range a b
            otherwise        -> case map readDecimalNat $ breakAt '+' xs of
              [Just a, Just b] -> Just $ Skip a b
              otherwise        -> Nothing

        breakAt :: (Eq a) => a -> [a] -> [[a]]
        breakAt x = breakBy (== x)
          where
            breakBy :: (a -> Bool) -> [a] -> [[a]]
            breakBy _ [] = [[]]
            breakBy p xs = case break p xs of
              (ys,[])   -> [ys]
              (ys,_:zs) -> ys : breakBy p zs

The helper function readIntSet takes the string representation of a set and returns a function that detects whether a given integer is in the set specified. Compared to representing a set of integers as a set, this makes representing large ranges more efficient and makes representing infinite sets (like skip lists) possible.

Next we write a helper function that extracts items from a list.

getEltsByIndex :: (Int -> Bool) -> [a] -> [a]
getEltsByIndex p xs = map snd $ filter (p . fst) $ zip [1..] xs

We introduced the type of ASA carriage control lines in the overstrike tool. To handle such files here, we need a helper function for reading carriage control formatted lines.

readCCLines :: String -> Maybe [CCLine]
readCCLines = unfoldrMaybe readFirstCCLine . getLines
  where
    readFirstCCLine :: [String] -> Maybe (Maybe (CCLine, [String]))
    readFirstCCLine [] = Just Nothing
    readFirstCCLine ((' ':cs):ds) = do
      let
        (us,vs) = span (isPrefixOf "+") ds

        stripPlus xs = case xs of
          '+':ys    -> Just ys
          otherwise -> Nothing

      case sequence $ map stripPlus us of
        Just ws -> Just (Just (CCLine $ cs:ws, vs))
        Nothing -> Nothing
    readFirstCCLine _ = Nothing

Finally, the main program is simple enough. We take one optional argument, --asacc, which interprets “lines” using the ASA carriage control format.

data Mode = Lines | ASACC

main :: IO ()
main = do
  args <- getArgs

  -- interpret arguments
  (mode,inv,tests) <- do
    let
      (modeflag,rest') = case args of
        ("--asacc":zs) -> (ASACC, zs)
        zs             -> (Lines, zs)

      (notflag,rest) = case rest' of
        ("--not":zs) -> (True,  zs)
        zs           -> (False, zs)

    ps <- case sequence $ map readIntSet rest of
      Just xs -> return xs
      Nothing -> argErr >> exitFailure

    return (modeflag,notflag,ps)

  let
    get xs p = case inv of
      False -> getEltsByIndex p xs
      True  -> getEltsByIndex (not . p) xs

  case mode of
    Lines -> do
      lines <- fmap getLines getContents
      sequence_ $ map putStrLn $ concatMap (get lines) tests
    ASACC -> do
      lines <- fmap readCCLines getContents
      case lines of
        Nothing -> corrErr >> exitFailure
        Just zs -> sequence_ $ map (putStrLn . renderCCLine) $ concatMap (get zs) tests

  exitSuccess


argErr :: IO ()
argErr = reportErrorMsgs
  [ "usage:"
  , "  getlines INTSET ... : extract lines from stdin at indices in RANGE (sorted)"
  , "options:"
  , "  --asacc : read as ASA carriage control lines"
  ]


corrErr :: IO ()
corrErr = reportErrorMsgs
  [ "corrupt input" ]

Old Stuff

data CCLine
  = CCLine [String]
  deriving (Show)


renderCCLine :: CCLine -> String
renderCCLine (CCLine xs)
  = intercalate "\n" $ zipWith (:) (' ' : (repeat '+')) xs


unfoldrMaybe :: (b -> Maybe (Maybe (a,b))) -> b -> Maybe [a]
unfoldrMaybe f x = case f x of
  Nothing -> Nothing
  Just Nothing -> Just []
  Just (Just (a,b)) -> do
    as <- unfoldrMaybe f b
    Just (a:as)


-- write list of messages to stderr
reportErrorMsgs :: [String] -> IO ()
reportErrorMsgs errs = do
  name <- getProgName
  sequence_ $ map (hPutStrLn stderr) $ ((name ++ " error"):errs)


-- parse a natural number base 10
readDecimalNat :: String -> Maybe Int
readDecimalNat xs = do
  ys <- sequence $ map decToInt $ reverse xs
  return $ sum $ zipWith (*) ys [10^t | t <- [0..]]
  where
    decToInt :: Char -> Maybe Int
    decToInt x = lookup x
      [ ('0',0), ('1',1), ('2',2), ('3',3), ('4',4)
      , ('5',5), ('6',6), ('7',7), ('8',8), ('9',9)
      ]


-- split on \n
getLines :: String -> [String]
getLines = unfoldr firstLine
  where
    firstLine :: String -> Maybe (String, String)
    firstLine xs = case break (== '\n') xs of
      ("","")   -> Nothing
      (as,"")   -> Just (as,"")
      (as,b:bs) -> Just (as,bs)