commit 25541bf06f3a8af6e719337bf32fb05ac9661acc
Author: puckoprutt
Date:   Mon Feb 17 21:33:36 2025 +0100

    first version

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3d91c19
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.vscode
+dagens_lunch/spiders/__pycache__/*.pyc
+dagens_lunch/__pycache__/*.pyc
\ No newline at end of file
diff --git a/dagens_lunch/__init__.py b/dagens_lunch/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dagens_lunch/items.py b/dagens_lunch/items.py
new file mode 100644
index 0000000..157ef62
--- /dev/null
+++ b/dagens_lunch/items.py
@@ -0,0 +1,44 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class DagensLunchValsaren(scrapy.Item):
+    place = scrapy.Field()
+    week = scrapy.Field()
+    monday = scrapy.Field()
+    tuesday = scrapy.Field()
+    wednesday = scrapy.Field()
+    thursday = scrapy.Field()
+    friday = scrapy.Field()
+    alltid = scrapy.Field()
+    klassiker = scrapy.Field()
+    price = scrapy.Field()
+    price_takeaway = scrapy.Field()
+    scraped_by = scrapy.Field()
+    scraped_at = scrapy.Field()
+
+class DagensLunchHeat(scrapy.Item):
+    place = scrapy.Field()
+    week = scrapy.Field()
+    monday = scrapy.Field()
+    tuesday = scrapy.Field()
+    wednesday = scrapy.Field()
+    thursday = scrapy.Field()
+    friday = scrapy.Field()
+    scraped_by = scrapy.Field()
+    scraped_at = scrapy.Field()
+
+class DagensLunchMalmens(scrapy.Item):
+    place = scrapy.Field()
+    week = scrapy.Field()
+    monday = scrapy.Field()
+    tuesday = scrapy.Field()
+    wednesday = scrapy.Field()
+    thursday = scrapy.Field()
+    friday = scrapy.Field()
+    scraped_by = scrapy.Field()
+    scraped_at = scrapy.Field()
diff --git a/dagens_lunch/middlewares.py b/dagens_lunch/middlewares.py
new file mode 100644
index 0000000..8539521
--- /dev/null
+++ b/dagens_lunch/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class DagensLunchSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class DagensLunchDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/dagens_lunch/pipelines.py b/dagens_lunch/pipelines.py
new file mode 100644
index 0000000..a32cfa3
--- /dev/null
+++ b/dagens_lunch/pipelines.py
@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class DagensLunchPipeline:
+    def process_item(self, item, spider):
+        return item
diff --git a/dagens_lunch/settings.py b/dagens_lunch/settings.py
new file mode 100644
index 0000000..53d8c9b
--- /dev/null
+++ b/dagens_lunch/settings.py
@@ -0,0 +1,92 @@
+# Scrapy settings for dagens_lunch project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# https://docs.scrapy.org/en/latest/topics/settings.html
+# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "dagens_lunch"
+
+SPIDER_MODULES = ["dagens_lunch.spiders"]
+NEWSPIDER_MODULE = "dagens_lunch.spiders"
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) puckoprutt edition Gecko/20100101 Firefox/133.0"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#    "Accept-Language": "en",
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    "dagens_lunch.middlewares.DagensLunchSpiderMiddleware": 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    "dagens_lunch.middlewares.DagensLunchDownloaderMiddleware": 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    "dagens_lunch.pipelines.DagensLunchPipeline": 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = "httpcache"
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
diff --git a/dagens_lunch/spiders/__init__.py b/dagens_lunch/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/dagens_lunch/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/dagens_lunch/spiders/heat.py b/dagens_lunch/spiders/heat.py
new file mode 100644
index 0000000..c877e94
--- /dev/null
+++ b/dagens_lunch/spiders/heat.py
@@ -0,0 +1,59 @@
+from datetime import datetime
+import xml.etree.ElementTree as ET
+import scrapy
+import chompjs
+from ..items import DagensLunchHeat
+
+class HeatSpider(scrapy.Spider):
+    name = "heat"
+    allowed_domains = ["heatrestauranger.se", "castit.nu"]
+    start_urls = ["https://heatrestauranger.se/lunch-heat-vasteras-kopparlunden/#veckans-lunchmeny"]
+
+    def extract_xml(self, response):
+        dagens = response.meta.get("dagens")
+        tree = ET.fromstring(response.body)
+        self.week = tree.find(".//rubrik").text
+        self.monday = []
+        self.tuesday = []
+        self.wednesday = []
+        self.thursday = []
+        self.friday = []
+
+        for idx in range(1, 8):
+            if not tree.find(f".//mandagratt{idx}rubrik") is None:
+                self.monday.append({"rubrik": tree.find(f".//mandagratt{idx}rubrik").text, "text": tree.find(f".//mandagratt{idx}text").text})
+            if not tree.find(f".//tisdagratt{idx}rubrik") is None:
+                self.tuesday.append({"rubrik": tree.find(f".//tisdagratt{idx}rubrik").text, "text": tree.find(f".//tisdagratt{idx}text").text})
+            if not tree.find(f".//onsdagratt{idx}rubrik") is None:
+                self.wednesday.append({"rubrik": tree.find(f".//onsdagratt{idx}rubrik").text, "text": tree.find(f".//onsdagratt{idx}text").text})
+            if not tree.find(f".//torsdagratt{idx}rubrik") is None:
+                self.thursday.append({"rubrik": tree.find(f".//torsdagratt{idx}rubrik").text, "text": tree.find(f".//torsdagratt{idx}text").text})
+            if not tree.find(f".//fredagratt{idx}rubrik") is None:
+                self.friday.append({"rubrik": tree.find(f".//fredagratt{idx}rubrik").text, "text": tree.find(f".//fredagratt{idx}text").text})
+
+        dagens["week"] = self.week
+        dagens["monday"] = self.monday
+        dagens["tuesday"] = self.tuesday
+        dagens["wednesday"] = self.wednesday
+        dagens["thursday"] = self.thursday
+        dagens["friday"] = self.friday
+        return dagens
+
+    def parse(self, response):
+        scripts = response.xpath("//script[not (@id) and not (@src) and not (@class)]/text()").getall()
+        url = ""
+        dagens = DagensLunchHeat()
+        dagens["place"] = "Heat Kopparlunden"
+        dagens["scraped_by"] = f"{self.__class__.__name__}"
+        for js in scripts:
+            if not "jQuery(function( $ ){" in js:
+                continue
+            js = js.split("\n")
+            for line in js:
+                line = line.strip()
+                if "url:\"https://castit.nu/xml" in line:
+                    url = line[5:-2]
+                    yield scrapy.Request(url, callback=self.extract_xml, meta={"dagens": dagens})
+        dagens["scraped_at"] = f"{datetime.now().isoformat()}"
+
+        return dagens
\ No newline at end of file
diff --git a/dagens_lunch/spiders/malmens.py b/dagens_lunch/spiders/malmens.py
new file mode 100644
index 0000000..49c65cd
--- /dev/null
+++ b/dagens_lunch/spiders/malmens.py
@@ -0,0 +1,42 @@
+from datetime import datetime
+import scrapy
+from ..items import DagensLunchMalmens
+
+class MalmensSpider(scrapy.Spider):
+    name = "malmens"
+    allowed_domains = ["gastrogate.com"]
+    start_urls = ["https://malmens.gastrogate.com/lunchbuffe/"]
+
+    def parse(self, response):
+        tbodys = response.xpath("//table[contains(@class, 'lunch_menu')]/tbody")
+        self.monday = []
+        self.tuesday = []
+        self.wednesday = []
+        self.thursday = []
+        self.friday = []
+
+        for idx, body in enumerate(tbodys):
+            for food in body.xpath("./tr/td[contains(@class, 'td_title')]/text()").getall():
+                if idx == 0 and food.strip() != "":
+                    self.monday.append(food.strip())
+                elif idx == 1 and food.strip() != "":
+                    self.tuesday.append(food.strip())
+                elif idx == 2 and food.strip() != "":
+                    self.wednesday.append(food.strip())
+                elif idx == 3 and food.strip() != "":
+                    self.thursday.append(food.strip())
+                elif idx == 4 and food.strip() != "":
+                    self.friday.append(food.strip())
+
+        dagens = DagensLunchMalmens()
+        dagens["place"] = "Malmens"
+        dagens["week"] = response.xpath("//a[contains(@class, 'dropdown-toggle')]/text()").get().strip()
+        dagens["monday"] = self.monday
+        dagens["tuesday"] = self.tuesday
+        dagens["wednesday"] = self.wednesday
+        dagens["thursday"] = self.thursday
+        dagens["friday"] = self.friday
+        dagens["scraped_by"] = self.__class__.__name__
+        dagens["scraped_at"] = f"{datetime.now().isoformat()}"
+
+        return dagens
\ No newline at end of file
diff --git a/dagens_lunch/spiders/valsaren.py b/dagens_lunch/spiders/valsaren.py
new file mode 100644
index 0000000..3e3a3f5
--- /dev/null
+++ b/dagens_lunch/spiders/valsaren.py
@@ -0,0 +1,140 @@
+from io import BytesIO
+from pathlib import Path
+from datetime import datetime
+import chompjs
+import re
+from PyPDF2 import PdfReader
+import scrapy
+from ..items import DagensLunchValsaren
+
+
+class ValsarenSpider(scrapy.Spider):
+    name = "valsaren"
+    allowed_domains = ["valsaren.se", "misssite.com", "files.builder.misssite.com"]
+    start_urls = ["https://valsaren.se/maten"]
+
+    def __init__(self, name = None, **kwargs):
+        super().__init__(name, **kwargs)
+        self.week = ""
+        self.days = []
+        self.klassiker = ""
+        self.alltid = ""
+        self.price = ""
+        self.price_takeaway = ""
+
+    def get_menu_url(self, script):
+        sp = dict()
+        for line in script.split("\n"):
+            l = line.strip()
+            if l.startswith("Server.comp"):
+                sp = chompjs.parse_js_object(l)
+                break
+
+        for obj in sp.keys():
+            if "data" in sp[obj].keys():
+                if isinstance(sp[obj]["data"], dict):
+                    if "url" in sp[obj]["data"].keys() and "fileName" in sp[obj]["data"].keys():
+                        if sp[obj]["data"]["fileName"].lower().endswith(".pdf"):
+                            return sp[obj]["data"]["url"]
+        return None
+
+    def get_menu(self, response):
+        yield scrapy.Request(response, callback=self.extract_menu)
+
+    def extract_menu(self, response):
+        dagens = response.meta.get("dagens")
+        reader = PdfReader(BytesIO(response.body))
+        all_text = reader.pages[0].extract_text().split("\n")
+        dagens["week"] = all_text.pop(0).strip()
+        words = ["VECKANS KLASSIKER", "ALLTID PÅ VALSAREN"]
+        current = 0
+        jump_over = False
+        days = []
+        klassiker = ""
+        alltid = ""
+        for idx, t in enumerate(all_text):
+            if jump_over:
+                jump_over = False
+                continue
+
+            if current == 0b1000:
+                if words[0] in t:
+                    days.append(t.replace(words[0], ""))
+                    current = 0b10
+                elif words[1] in t:
+                    days.append(t.replace(words[1], ""))
+                    current = 0b100
+                else:
+                    if t.endswith(","):
+                        days.append(" ".join([all_text[idx], all_text[idx+1]]))
+                        jump_over = True
+                    else:
+                        days.append(t)
+                    current = 0
+
+            elif current == 0b100:
+                if t.endswith(","):
+                    alltid = " ".join([all_text[idx], all_text[idx+1]])
+                    jump_over = True
+                else:
+                    alltid = t
+                current = 0
+
+            elif current == 0b10:
+                if t.endswith(","):
+                    klassiker = " ".join([all_text[idx], all_text[idx+1]])
+                    jump_over = True
+                else:
+                    klassiker = t
+                current = 0
+
+            if words[0] in t:
+                current = 0b10
+            elif words[1] in t:
+                current = 0b100
+            elif "måndag" in t.lower():
+                current = 0b1000
+            elif "tisdag" in t.lower():
+                current = 0b1000
+            elif "onsdag" in t.lower():
+                current = 0b1000
+            elif "torsdag" in t.lower():
+                current = 0b1000
+            elif "fredag" in t.lower():
+                current = 0b1000
+            else:
+                current = 0
+
+        dagens["alltid"] = re.sub(r'(& )(\w)( )', r"\1\2", alltid)
+        dagens["klassiker"] = re.sub(r'(& )(\w)( )', r"\1\2", klassiker)
+        dagens["monday"] = re.sub(r'(& )(\w)( )', r"\1\2", days[0])
+        dagens["tuesday"] = re.sub(r'(& )(\w)( )', r"\1\2", days[1])
+        dagens["wednesday"] = re.sub(r'(& )(\w)( )', r"\1\2", days[2])
+        dagens["thursday"] = re.sub(r'(& )(\w)( )', r"\1\2", days[3])
+        dagens["friday"] = re.sub(r'(& )(\w)( )', r"\1\2", days[4])
+        return dagens
+
+    def parse(self, response):
+        SECTION_UUID = "Wr8Wn2H9iFvL7hh"
+        section = response.selector.xpath(f"//section[contains(@data-uniqueid, '{SECTION_UUID}')]")
+        p = section.xpath("//p[contains(@class, 'contentgroup__body')]/text()").getall()
+        dagens = DagensLunchValsaren()
+        dagens["price"] = int(p.pop(0).strip().replace(" kr", "").split(" ")[-1])
+        dagens["price_takeaway"] = int(p.pop(0).strip().replace(" kr", "").split(" ")[-1])
+        js_scripts = response.xpath("//script[contains(@nonce, '')]/text()").getall()
+        menu_url = self.get_menu_url(js_scripts[4])
+        if menu_url is None:
+            for x in js_scripts:
+                url = self.get_menu_url(x)
+                if not url is None:
+                    menu_url = url
+                    break
+        if menu_url is None:
+            raise ValueError("could not get menu.")
+
+        #response.follow(menu_url, callback=self.extract_menu)
+        yield scrapy.Request(menu_url, callback=self.extract_menu, meta={"dagens": dagens})
+        dagens["place"] = "Valsaren - Hotell & Kök"
+        dagens["scraped_by"] = self.__class__.__name__
+        dagens["scraped_at"] = f"{datetime.now().isoformat()}"
+        return dagens
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..fc9a801
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,12 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = dagens_lunch.settings
+ROBOTSTXT_OBEY = False
+
+[deploy]
+#url = http://localhost:6800/
+project = dagens_lunch
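
Note on pipelines: the committed DagensLunchPipeline is a pass-through and ITEM_PIPELINES is left commented out in settings.py, so no pipeline runs yet. If item cleanup were wanted there, a minimal sketch could look like the following; the whitespace-stripping behaviour is an assumption for illustration, not something this commit implements.

# Hedged sketch of dagens_lunch/pipelines.py with a trivial cleanup step.
# Activate it by uncommenting ITEM_PIPELINES in settings.py:
#   ITEM_PIPELINES = {"dagens_lunch.pipelines.DagensLunchPipeline": 300}
from itemadapter import ItemAdapter


class DagensLunchPipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        for field in adapter.field_names():
            value = adapter.get(field)
            # Strip stray whitespace from plain string fields; the day fields
            # that hold lists of dishes are passed through untouched.
            if isinstance(value, str):
                adapter[field] = value.strip()
        return item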
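
Note on the Valsaren spider: get_menu_url() hands the inline "Server.comp..." script line to chompjs.parse_js_object, which turns the embedded JavaScript object into a Python dict, and then walks that dict for a data entry whose fileName ends in .pdf. A small illustration of the call is below; the Server.components line is a made-up stand-in, and the real payload on valsaren.se is larger and may be shaped differently.

import chompjs

# Hypothetical inline-script line of the rough shape get_menu_url() expects.
line = 'Server.components = {"menuFile": {"data": {"url": "https://files.builder.misssite.com/abc/menu.pdf", "fileName": "Lunchmeny.pdf"}}}'

parsed = chompjs.parse_js_object(line)  # parses the first JS object literal into a dict
for obj in parsed.values():
    data = obj.get("data")
    if isinstance(data, dict) and data.get("fileName", "").lower().endswith(".pdf"):
        print(data["url"])  # https://files.builder.misssite.com/abc/menu.pdf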
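
Note on running the project (not part of the commit): each spider can be crawled on its own with Scrapy's CLI, for example "scrapy crawl heat -O heat.json" run from the directory holding scrapy.cfg. The sketch below shows one way to run all three spiders in a single process and write them to one JSON feed; the file name run_spiders.py and the output path lunch.json are assumptions, not something the commit ships.

# run_spiders.py -- minimal sketch, assumes it sits next to scrapy.cfg so that
# get_project_settings() picks up dagens_lunch/settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dagens_lunch.spiders.heat import HeatSpider
from dagens_lunch.spiders.malmens import MalmensSpider
from dagens_lunch.spiders.valsaren import ValsarenSpider

settings = get_project_settings()
# "lunch.json" is an arbitrary output path; FEED_EXPORT_ENCODING = "utf-8"
# from settings.py keeps the Swedish menu text readable in the feed.
settings.set("FEEDS", {"lunch.json": {"format": "json", "overwrite": True}})

process = CrawlerProcess(settings)
process.crawl(HeatSpider)
process.crawl(MalmensSpider)
process.crawl(ValsarenSpider)
process.start()  # blocks until all three crawls have finished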