From 4c877e346fc30b65fba44e917411cf7ef710b393 Mon Sep 17 00:00:00 2001
From: puckoprutt
Date: Tue, 18 Feb 2025 18:33:35 +0100
Subject: [PATCH 1/4] added a max spider

---
 dagens_lunch/items.py       | 10 ++++++++++
 dagens_lunch/spiders/max.py | 26 ++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 dagens_lunch/spiders/max.py

diff --git a/dagens_lunch/items.py b/dagens_lunch/items.py
index 157ef62..089a9fc 100644
--- a/dagens_lunch/items.py
+++ b/dagens_lunch/items.py
@@ -42,3 +42,13 @@ class DagensLunchMalmens(scrapy.Item):
     friday = scrapy.Field()
     scraped_by = scrapy.Field()
     scraped_at = scrapy.Field()
+
+class DagensLunchMax(scrapy.Item):
+    place = scrapy.Field()
+    monday = scrapy.Field()
+    tuesday = scrapy.Field()
+    wednesday = scrapy.Field()
+    thursday = scrapy.Field()
+    friday = scrapy.Field()
+    scraped_by = scrapy.Field()
+    scraped_at = scrapy.Field()
diff --git a/dagens_lunch/spiders/max.py b/dagens_lunch/spiders/max.py
new file mode 100644
index 0000000..8382e1c
--- /dev/null
+++ b/dagens_lunch/spiders/max.py
@@ -0,0 +1,26 @@
+from datetime import datetime
+import scrapy
+from ..items import DagensLunchMax
+
+class MaxSpider(scrapy.Spider):
+    name = "max"
+    allowed_domains = ["max.se"]
+    start_urls = ["https://max.se/maten/meny/maltider/dagens-lunch/"]
+
+    def parse(self, response):
+        days = []
+        # the weekday entries live in the o-product-info variations list
+        lista = response.xpath("//div/div[2]/div[contains(@class, 'o-product-info')]/ul[contains(@class, 'o-product-info__variations')]")
+        for li in lista.xpath("./li/text()").getall():
+            # keep only the last whitespace-separated word of each entry
+            days.append(li.split(" ")[-1])
+        return DagensLunchMax(
+            place="Max",
+            monday=days[0],
+            tuesday=days[1],
+            wednesday=days[2],
+            thursday=days[3],
+            friday=days[4],
+            scraped_by=self.__class__.__name__,
+            scraped_at=datetime.now().isoformat()
+        )
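
[Review note on PATCH 1/4] parse() above assumes the page always lists exactly
five rows: monday=days[0] .. friday=days[4] raises IndexError otherwise, and
li.split(" ")[-1] keeps only the last word of each row, which would truncate a
multi-word dish name. Below is a more defensive sketch, not a drop-in
replacement: it assumes each <li> text reads "<weekday> <dish>", which the
patch itself does not confirm, so verify against the live page first:

    from datetime import datetime
    from itertools import zip_longest

    import scrapy

    from ..items import DagensLunchMax

    class MaxSpider(scrapy.Spider):
        name = "max"
        allowed_domains = ["max.se"]
        start_urls = ["https://max.se/maten/meny/maltider/dagens-lunch/"]

        def parse(self, response):
            rows = response.xpath(
                "//div/div[2]/div[contains(@class, 'o-product-info')]"
                "/ul[contains(@class, 'o-product-info__variations')]/li/text()"
            ).getall()
            # assumption: each row starts with the weekday; split once and
            # keep the whole remaining dish name instead of the last word
            dishes = [row.split(" ", 1)[-1] for row in rows]
            item = DagensLunchMax(
                place="Max",
                scraped_by=self.__class__.__name__,
                scraped_at=datetime.now().isoformat(),
            )
            weekdays = ["monday", "tuesday", "wednesday", "thursday", "friday"]
            # pair whatever rows exist with the weekday fields; a short week
            # yields empty strings instead of an IndexError
            for day, dish in zip_longest(weekdays, dishes[:5], fillvalue=""):
                item[day] = dish
            return item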
From 4d4e03d9339c0bfff6bbd37af4dd107480bd3d1c Mon Sep 17 00:00:00 2001
From: puckoprutt
Date: Tue, 18 Feb 2025 18:36:49 +0100
Subject: [PATCH 2/4] run all spiders in one process

---
 dagens_lunch/settings.py |  8 ++++
 run_spiders.py           | 83 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100755 run_spiders.py

diff --git a/dagens_lunch/settings.py b/dagens_lunch/settings.py
index 53d8c9b..791a524 100644
--- a/dagens_lunch/settings.py
+++ b/dagens_lunch/settings.py
@@ -89,4 +89,12 @@ ROBOTSTXT_OBEY = False
 
 # Set settings whose default value is deprecated to a future-proof value
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEEDS = {
+    "items.json": {
+        "format": "json",
+        "encoding": "utf8"
+    }
+}
 FEED_EXPORT_ENCODING = "utf-8"
+
+LOG_ENABLED = False
diff --git a/run_spiders.py b/run_spiders.py
new file mode 100755
index 0000000..67a6f48
--- /dev/null
+++ b/run_spiders.py
@@ -0,0 +1,83 @@
+import json
+import pathlib
+
+from scrapy import signals, spiderloader
+from scrapy.crawler import CrawlerProcess
+from scrapy.signalmanager import dispatcher
+from scrapy.utils.project import get_project_settings
+
+out_dir = pathlib.Path(__file__).resolve().parent / "latest"
+out_dir.mkdir(parents=True, exist_ok=True)  # harmless if it already exists
+all_file = out_dir / "all.json"
+valsaren_file = out_dir / "valsaren.json"
+malmens_file = out_dir / "malmens.json"
+heat_file = out_dir / "heat.json"
+max_file = out_dir / "max.json"
+
+def dagens_lunch_results():
+    results = []
+    settings = get_project_settings()
+    process = CrawlerProcess(settings)
+    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
+
+    def crawler_results(signal, sender, item, response, spider):
+        results.append(item)
+        print(f"{spider.name} is done!")
+
+    # clear out the previous run's files before scraping again
+    for old in (all_file, valsaren_file, malmens_file, heat_file, max_file):
+        if old.is_file():
+            print(f"removing {old.name}")
+            old.unlink()
+
+    dispatcher.connect(crawler_results, signal=signals.item_scraped)
+
+    for spider_name in spider_loader.list():
+        print(f"getting menu from {spider_name}")
+        process.crawl(spider_name)
+
+    process.start()
+    return results
+
+if __name__ == "__main__":
+    _valsaren = None
+    _malmens = None
+    _heat = None
+    _max = None
+    res = dagens_lunch_results()
+    with all_file.open("w") as f:
+        f.write("[\n")
+        for idx, item in enumerate(res):
+            if idx > 0:
+                f.write(",")
+            f.write(json.dumps(dict(item), indent=4))
+            place = item["place"].split(" ")[0].lower()
+            if place == "valsaren":
+                _valsaren = json.dumps(dict(item))
+            elif place == "malmens":
+                _malmens = json.dumps(dict(item))
+            elif place == "heat":
+                _heat = json.dumps(dict(item))
+            elif place == "max":
+                _max = json.dumps(dict(item))
+            else:
+                # no dedicated file for this place yet
+                print(f"please teach me about {item['place']}")
+        f.write("\n]")
+    print(f"created: {all_file}")
+
+    if _valsaren:
+        valsaren_file.write_text(_valsaren)
+        print(f"created: {valsaren_file}")
+
+    if _malmens:
+        malmens_file.write_text(_malmens)
+        print(f"created: {malmens_file}")
+
+    if _heat:
+        heat_file.write_text(_heat)
+        print(f"created: {heat_file}")
+
+    if _max:
+        max_file.write_text(_max)
+        print(f"created: {max_file}")
From d9e7d6dc016b405986d893bdc8a5f4ad0fec9486 Mon Sep 17 00:00:00 2001
From: puckoprutt
Date: Tue, 18 Feb 2025 18:39:09 +0100
Subject: [PATCH 3/4] don't include scraped files

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 3d91c19..84c5302 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .vscode
+latest
 dagens_lunch/spiders/__pycache__/*.pyc
-dagens_lunch/__pycache__/*.pyc
\ No newline at end of file
+dagens_lunch/__pycache__/*.pyclatest
From cbb2936d262cc90e669987baf9f4e6bb24a4be21 Mon Sep 17 00:00:00 2001
From: puckoprutt
Date: Tue, 18 Feb 2025 17:47:25 +0000
Subject: [PATCH 4/4] Update .gitignore

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 84c5302..13953c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 .vscode
 latest
 dagens_lunch/spiders/__pycache__/*.pyc
-dagens_lunch/__pycache__/*.pyclatest
+dagens_lunch/__pycache__/*.pyc
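
[Review note on the series] run_spiders.py from PATCH 2/4 grows by one
module-level *_file variable, one elif arm and one guarded write_text block
for every new spider. A table-driven sketch of the output step, keeping that
patch's convention that the first word of item["place"], lowercased, names
the per-place file; write_results is a hypothetical helper, not something the
patch defines:

    import json
    import pathlib

    OUT_DIR = pathlib.Path(__file__).resolve().parent / "latest"

    def write_results(results):
        """Write all.json plus one <place>.json per scraped item."""
        OUT_DIR.mkdir(parents=True, exist_ok=True)
        items = [dict(item) for item in results]
        all_file = OUT_DIR / "all.json"
        # json.dumps produces the brackets and commas that the hand-rolled
        # f.write() loop in run_spiders.py assembles manually
        all_file.write_text(json.dumps(items, indent=4))
        print(f"created: {all_file}")
        for entry in items:
            place = entry["place"].split(" ")[0].lower()
            place_file = OUT_DIR / f"{place}.json"
            place_file.write_text(json.dumps(entry))
            print(f"created: {place_file}")

Called as write_results(res) from __main__, this also makes the "please teach
me" branch unnecessary: the file name is derived from the item instead of
being hard-coded per restaurant.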