import scrapy
from io import BytesIO
from PyPDF2 import PdfReader
from ..items import DagensLunchVarda


class VardaSpider(scrapy.Spider):
    """Scrape the weekly lunch menu PDF from restaurangvarda.se.

    The start page is crawled for the "Lunchmenyn" link in the site
    navigation, the linked PDF is downloaded, and its first page is
    parsed into a DagensLunchVarda item: one field per weekday plus the
    standing dishes (fish, green, pasta, salad, burger, bbq, always).
    """

    name = "varda"
    allowed_domains = ["restaurangvarda.se"]
    start_urls = ["https://restaurangvarda.se"]

    # Weekday names as they appear (lowercased) in the PDF, in menu order.
    WEEKDAYS = ("måndag", "tisdag", "onsdag", "torsdag", "fredag")

    def extract_pdf(self, response):
        """Parse the downloaded lunch-menu PDF into the item from ``response.meta``.

        Returns the populated DagensLunchVarda item so Scrapy emits it.
        """
        dagens = response.meta.get("dagens")
        reader = PdfReader(BytesIO(response.body))
        # NOTE(review): assumes extract_text() delimits menu entries with this
        # exact separator string — confirm against a real menu PDF.
        all_text = reader.pages[0].extract_text().split(" ")
        days = []        # dish text per weekday (expanded for day ranges)
        day_price = []   # price per weekday, parallel to `days`
        is_pasta = False  # set when a bare "pasta" heading precedes its entry
        for text in all_text:
            lowered = text.lower()
            if lowered.startswith("lunchmeny"):
                dagens["about"] = text
            elif lowered.startswith(self.WEEKDAYS):
                # Entry shape: "Måndag 119:-Dish" or "Måndag – Onsdag 119:-Dish".
                parts = text.split(":-")
                extra, price = self.get_days(parts[0])
                self.logger.debug("weekday entry spans %d extra day(s)", extra)
                # A range like "Måndag – Onsdag" repeats the same dish/price
                # once per covered weekday.
                for _ in range(extra + 1):
                    days.append(parts[1])
                    day_price.append(price)
            elif "fisken" in lowered:
                parts = text.split(":-")
                # "kr" suffix added for consistency with every other dish
                # field (the original omitted it here only).
                dagens["fish"] = f"{parts[1].strip()} ({parts[0].split(' ')[-1].strip()}kr)"
            elif "gröna" in lowered:
                parts = text.split(":-")
                dagens["green"] = f"{parts[1].strip()} ({parts[0].split(' ')[-1].strip()}kr)"
            elif lowered == "pasta":
                # Bare heading: the actual pasta entry is the next token.
                is_pasta = True
            elif "pasta" in lowered or is_pasta:
                is_pasta = False
                # The pasta token also carries the salad entry after "SALLAD".
                pasta, salad = text.split("SALLAD")
                salad_price, salad = salad.split(":-")
                dagens["salad"] = f"{salad.strip()} ({salad_price.strip()}kr)"
                dagens["pasta"] = [
                    x.strip().replace(":-", "kr")
                    for x in pasta.strip().split("*")
                    if x
                ]
            elif "burgare" in lowered:
                parts = text.split(":-")
                burger_price = parts[0].split(' ')[-1].strip()
                dagens["burger"] = [
                    f"{x.strip()} ({burger_price}kr)"
                    for x in parts[1].split("*")
                    if x
                ]
            elif "alltid" in lowered:
                parts = text.split(":-")
                dagens["always"] = (
                    f"{parts[2].strip()} (hel: {parts[1].split(' ')[-1]}kr"
                    f" / halv: {parts[0].split(' ')[-1]}kr)"
                )
            elif "grillat" in lowered:
                parts = text.split(":-")
                dagens["bbq"] = f"{parts[1].strip()} ({parts[0].split(' ')[-1].strip()}kr)"
        # Map collected dishes onto weekday fields; zip stops early instead of
        # raising IndexError if the PDF yielded fewer than five entries.
        weekday_keys = ("monday", "tuesday", "wednesday", "thursday", "friday")
        for key, dish, price in zip(weekday_keys, days, day_price):
            dagens[key] = f"{dish.strip()} ({price.strip()}kr)"
        return dagens

    def get_days(self, string):
        """Return ``(extra_days, price)`` parsed from a weekday heading.

        A range such as ``"Måndag – Onsdag 119"`` yields the number of
        additional weekdays it covers (here 2) and the trailing price;
        a single weekday heading yields ``(0, price)``.
        """
        if "–" in string:
            parts = string.split(" ")
            price = parts[3].strip()
            first = self.WEEKDAYS.index(parts[0].strip().lower())
            second = self.WEEKDAYS.index(parts[2].strip().lower())
            return (second - first, price)
        return (0, string.split(" ")[-1])

    def parse(self, response):
        """Find the lunch-menu link on the start page and fetch the PDF."""
        lunch_url = ""
        hrefs = response.xpath(
            "//a[contains(@class, 'elementor-sub-item')]/@href"
        ).getall()
        for url in hrefs:
            if "Lunchmenyn" in url:
                lunch_url = url
                break
        if not lunch_url:
            # Without this guard scrapy.Request("") raises ValueError.
            self.logger.error("no 'Lunchmenyn' link found on %s", response.url)
            return
        dagens = DagensLunchVarda()
        dagens["place"] = "Varda"
        # The item is completed and returned by extract_pdf; the original's
        # trailing `return dagens` inside this generator never emitted it.
        yield scrapy.Request(lunch_url, callback=self.extract_pdf, meta={"dagens": dagens})