wohnugs-suche/berlin-scraper/app/Main.hs

249 lines
8.1 KiB
Haskell
Raw Normal View History

2023-10-24 22:15:11 +00:00
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
module Main where
2023-11-08 08:55:48 +00:00
import Control.Lens (view, (&), (.~))
2023-10-24 22:15:11 +00:00
import Control.Monad (when)
import Control.Monad.IO.Class
import Data.Aeson hiding (Options)
import Data.Aeson qualified as Aeson
import Data.ByteString.Lazy qualified as LBS
import Data.Maybe (fromJust, fromMaybe, listToMaybe)
import Data.Text qualified as Text
import Data.Text.Encoding qualified as Text
import Data.Text.IO qualified as Text
import Database.SQLite.Simple
import GHC.Generics
2023-11-08 08:55:48 +00:00
import Network.HTTP.Client.OpenSSL
2023-10-24 22:15:11 +00:00
import Network.Wreq hiding (Options)
import Options.Generic
2023-11-08 08:55:48 +00:00
import Servant.Client (mkClientEnv)
2023-10-24 22:15:11 +00:00
import System.Exit (exitFailure)
import Telegram.Bot.API
import Telegram.Bot.Simple
import Telegram.Bot.Simple.Debug
import Text.HTML.Scalpel
2023-11-08 08:55:48 +00:00
import qualified Network.Wreq as Wreq
2023-10-24 22:15:11 +00:00
main :: IO ()
2023-11-08 08:55:48 +00:00
main = withOpenSSL $ do
2023-10-24 22:15:11 +00:00
opts <- unwrapRecord "Berlin Scrapper"
token <- getEnvToken "TELEGRAM_BOT_TOKEN"
2023-11-08 08:55:48 +00:00
httpMgr <- newOpenSSLManager
let clientEnv = mkClientEnv httpMgr (botBaseUrl token)
wreqOpts = Wreq.defaults & Wreq.manager .~ Right httpMgr
startBot (traceBotDefault $ botApp opts wreqOpts) clientEnv >>= \case
2023-10-24 22:15:11 +00:00
Left err -> do
putStrLn $ "Bot failed with: " <> show err
exitFailure
_ -> pure ()
-- * Types
newtype Options w = Options {dbFile :: w ::: FilePath <?> "Path to the sqlite database"}
deriving (Generic)
instance ParseRecord (Options Wrapped)
data Offer = Offer
{ id_ :: Text,
title :: Text,
address :: Maybe Text,
rooms :: Maybe Double,
area :: Maybe Double,
availableFrom :: Maybe Text,
link :: Text
}
deriving (Show, Generic)
instance FromRow Offer
instance ToRow Offer
data IBWResponse = IBWResponse
{ headline :: Text,
searchresults :: Text
}
deriving (Generic)
instance FromJSON IBWResponse
-- * Parsing
2023-11-08 08:55:48 +00:00
queryIBW :: Wreq.Options -> IO IBWResponse
queryIBW wreqOpts = do
2023-10-24 22:15:11 +00:00
let reqBody =
[ partText "q" "wf-save-srch",
partText "save" "false",
partText "heizung_zentral" "false",
partText "heizung_etage" "false",
partText "energy_fernwaerme" "false",
partText "heizung_nachtstrom" "false",
partText "heizung_ofen" "false",
partText "heizung_gas" "false",
partText "heizung_oel" "false",
partText "heizung_solar" "false",
partText "heizung_erdwaerme" "false",
partText "heizung_fussboden" "false",
partText "seniorenwohnung" "false",
partText "maisonette" "false",
partText "etagen_dg" "false",
partText "balkon_loggia_terrasse" "false",
partText "garten" "false",
partText "wbs" "0",
partText "barrierefrei" "false",
partText "gaeste_wc" "false",
partText "aufzug" "false",
partText "stellplatz" "false",
partText "keller" "false",
partText "badewanne" "false",
partText "dusche" "false",
partText "bez[]" "01_00",
partText "bez[]" "02_00",
partText "bez[]" "04_00",
partText "bez[]" "07_00",
partText "bez[]" "08_00",
partText "bez[]" "09_00",
partText "bez[]" "11_00"
]
let link = "https://inberlinwohnen.de/wp-content/themes/ibw/skript/wohnungsfinder.php"
2023-11-08 08:55:48 +00:00
resBS <- view responseBody <$> postWith wreqOpts link reqBody
2023-10-24 22:15:11 +00:00
pure $ fromJust $ decode @IBWResponse resBS
scrapeOffers :: Scraper Text [Offer]
scrapeOffers = do
let listItemsSelector = "div" @: [hasClass "result-list"] // "ul" // "li"
chroots listItemsSelector scrapeOffer
scrapeOffer :: Scraper Text Offer
scrapeOffer = do
title <- Text.strip <$> text "h3"
let tableSelector = "div" @: [hasClass "tb-merkdetails"] // "div" @: [hasClass "span_wflist_data"] // "table" @: [hasClass "fullw"] // "tbody"
rowsSelector = tableSelector // "tr"
tableDataParser = do
thVal <- text "th"
(Text.strip thVal,) <$> text "td"
allTableData <- chroots rowsSelector tableDataParser
let address = lookup "Adresse:" allTableData
roomsStr = lookup "Zimmeranzahl:" allTableData
rooms = readEuropeanNumber <$> roomsStr
areaStr = lookup "Wohnfläche:" allTableData
area = readEuropeanNumber <$> (Text.stripSuffix "" =<< areaStr)
availableFrom = lookup "Bezugsfertig ab:" allTableData
link <- attr "href" $ "a" @: [hasClass "org-but"]
id_ <- attr "id" "li"
pure Offer {..}
readEuropeanNumber :: Text -> Double
readEuropeanNumber x = do
read $ Text.unpack $ Text.replace "," "." x
-- * SQLite
insertOffer :: Connection -> Offer -> IO ()
insertOffer conn =
execute conn "INSERT INTO offers (id, title, address, rooms, area, availableFrom, link) VALUES (?, ?, ?, ?, ?, ?, ?)"
getOffer :: Connection -> Text -> IO (Maybe Offer)
getOffer conn offerId =
listToMaybe <$> query conn "SELECT * from offers where id = ?" (Only offerId)
createTable :: Connection -> IO ()
createTable conn =
execute_ conn "CREATE TABLE IF NOT EXISTS offers (id TEXT PRIMARY KEY, title TEXT, address TEXT, rooms REAL, area REAL, availableFrom TEXT, link TEXT)"
saveOffer :: Connection -> Offer -> IO Bool
saveOffer conn offer = do
getOffer conn offer.id_ >>= \case
Nothing -> do
insertOffer conn offer
pure True
_ -> pure False
-- * Telegram
type Model = ()
newtype Action = StartChat Chat
deriving (Show)
2023-11-08 08:55:48 +00:00
botApp :: Options Unwrapped -> Wreq.Options -> BotApp Model Action
botApp opts wreqOpts =
2023-10-24 22:15:11 +00:00
BotApp
{ botInitialModel = (),
botAction = action,
botHandler = handler,
2023-11-08 08:55:48 +00:00
botJobs = [scrapeJob opts wreqOpts]
2023-10-24 22:15:11 +00:00
}
action :: Update -> Model -> Maybe Action
action update _ = do
msg <- update.updateMessage
txt <- msg.messageText
if txt == "/start"
then pure $ StartChat msg.messageChat
else Nothing
handler :: Action -> Model -> Eff Action Model
handler (StartChat chat) model =
model <# do
liftIO $ putStrLn $ "Chat started! " <> ppAsJSON chat
2023-11-08 08:55:48 +00:00
scrapeJob :: Options Unwrapped -> Wreq.Options -> BotJob Model Action
scrapeJob opts wreqOpts =
2023-10-24 22:15:11 +00:00
BotJob
{ botJobSchedule = "* * * * *",
2023-11-08 08:55:48 +00:00
botJobTask = scrapeJobTask opts wreqOpts
2023-10-24 22:15:11 +00:00
}
2023-11-08 08:55:48 +00:00
scrapeJobTask :: Options Unwrapped -> Wreq.Options -> Model -> Eff Action Model
scrapeJobTask opts wreqOpts m =
2023-10-24 22:15:11 +00:00
m <# do
liftIO $ putStrLn "Starting scrape job"
2023-11-08 08:55:48 +00:00
res <- liftIO $ queryIBW wreqOpts
2023-10-24 22:15:11 +00:00
let offers = fromJust $ scrapeStringLike res.searchresults scrapeOffers
liftIO $ putStrLn "Fetched offers"
conn <- liftIO $ open (dbFile opts)
liftIO $ createTable conn
mapM_
( \offer -> do
isNewOffer <- liftIO $ saveOffer conn offer
when isNewOffer $ do
liftIO $ putStrLn "Found a new offer"
notify offer
)
offers
notify :: Offer -> BotM ()
notify offer = do
let offerTitle = "<b><u>" <> offer.title <> "</u></b>"
offerAddress = "<b>Address:</b> " <> fromMaybe "N/A" offer.address
offerRooms = "<b>Rooms:</b> " <> maybe "N/A" (Text.pack . show) offer.rooms
offerArea = "<b>Area:</b> " <> maybe "N/A" ((<> "") . Text.pack . show) offer.area
offerLink = "<a href=\"https://inberlinwohnen.de" <> offer.link <> "\" >Apply Here</a>"
offerBody = offerAddress <> "\n" <> offerRooms <> "\n" <> offerArea <> "\n" <> offerLink
offerText = offerTitle <> "\n\n" <> offerBody
2023-10-27 07:20:51 +00:00
sendMsgReq1 = (defSendMessage (SomeChatId $ ChatId 952512153) offerText) {sendMessageParseMode = Just HTML}
sendMsgReq2 = (defSendMessage (SomeChatId $ ChatId 116981707) offerText) {sendMessageParseMode = Just HTML}
res1 <- runTG sendMsgReq1
2023-10-24 22:15:11 +00:00
liftIO $
2023-10-27 07:20:51 +00:00
if res1.responseOk
2023-10-24 22:15:11 +00:00
then putStrLn "Notified successfully"
else do
putStrLn $ "Failed to notify the offer: " <> show offer
2023-10-27 07:20:51 +00:00
Text.putStrLn $ "Response: " <> Text.decodeUtf8 (LBS.toStrict $ Aeson.encode res1)
res2 <- runTG sendMsgReq2
liftIO $
if res2.responseOk
then putStrLn "Notified successfully"
else do
putStrLn $ "Failed to notify the offer: " <> show offer
Text.putStrLn $ "Response: " <> Text.decodeUtf8 (LBS.toStrict $ Aeson.encode res2)