AI Job Finder
Provide a prompt or a CV and find jobs tailored to you.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 4
Monthly users: 3
Runs succeeded: >99%
Last modified: 2 months ago
requirements.txt
apify-client>=1.0.0,<2.0.0
openai>=1.0.0
anthropic>=0.5.0
google-generativeai==0.3.0
pydantic>=2.0.0
test.py
#!/usr/bin/env python3
import asyncio
import json
import sys
import os
from typing import Dict, Any, Optional

# Import from our modules
from src.llm_providers.factory import create_llm_provider
from src.cv_processor import process_cv
from src.prompt_processor import process_prompt
from src.parameter_handler import apply_parameter_defaults

async def test_cv_processing():
    """Test CV processing with a local file"""
    # Check if file path was provided
    if len(sys.argv) < 2:
        print("Usage: python test.py path/to/cv.pdf [prompt]")
        sys.exit(1)

    # Get CV file path and optional prompt
    cv_path = sys.argv[1]
    prompt = sys.argv[2] if len(sys.argv) > 2 else None

    # Check if API key is set
    openai_key = os.environ.get("OPENAI_API_KEY")
    if not openai_key:
        print("ERROR: OPENAI_API_KEY environment variable not set.")
        print("Please set it with: export OPENAI_API_KEY=your-api-key")
        sys.exit(1)

    # Read CV file
    try:
        with open(cv_path, "rb") as f:
            cv_data = f.read()

        # Convert to base64 for testing
        import base64
        import mimetypes
        mime_type, _ = mimetypes.guess_type(cv_path)
        if not mime_type:
            mime_type = "application/octet-stream"

        cv_data_base64 = f"data:{mime_type};base64,{base64.b64encode(cv_data).decode('utf-8')}"
    except Exception as e:
        print(f"Error reading CV file: {str(e)}")
        sys.exit(1)

    # Create LLM provider
    provider = create_llm_provider("openai", openai_key)

    # Process CV
    print("Processing CV...")
    cv_parameters = await process_cv(cv_data_base64, provider, "openai")
    print(f"CV Parameters: {json.dumps(cv_parameters, indent=2)}")

    # Process prompt if provided
    prompt_parameters = {}
    if prompt:
        print("\nProcessing prompt...")
        prompt_parameters = await process_prompt(prompt, provider)
        print(f"Prompt Parameters: {json.dumps(prompt_parameters, indent=2)}")

    # Merge and apply defaults
    parameters = {**cv_parameters, **prompt_parameters}
    final_parameters = apply_parameter_defaults(parameters)

    print("\nFinal LinkedIn Search Parameters:")
    print(json.dumps(final_parameters, indent=2))

    # Note: This test doesn't actually call the LinkedIn scraper
    print("\nTest complete. To perform a real LinkedIn search, upload this Actor to Apify.")

if __name__ == "__main__":
    asyncio.run(test_cv_processing())
.actor/actor.json
{ "actorSpecification": 1, "name": "ai-job-finder", "title": "AI Job Finder", "description": "An AI-powered tool that reads a CV and/or prompt to find relevant jobs on LinkedIn", "version": "0.1", "buildTag": "latest", "meta": { "templateId": "python-apify" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/Dockerfile
# First specify the base Docker image.
FROM apify/actor-python:3.12

# Copy requirements.txt into the Actor image
COPY requirements.txt ./

# Install the packages specified in requirements.txt
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Copy the remaining files and directories with the source code
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor
CMD ["python3", "-m", "src"]
.actor/INPUT_SCHEMA.json
{ "title": "AI Job Finder", "description": "An AI-powered tool that reads a CV and/or prompt to find relevant jobs on LinkedIn", "type": "object", "schemaVersion": 1, "properties": { "cv": { "title": "CV/Resume", "type": "object", "description": "Upload your CV/resume (PDF, DOCX, TXT formats supported) as Base64 encoded string", "editor": "file", "nullable": true }, "prompt": { "title": "Job Search Query", "type": "string", "description": "Describe the job you're looking for (e.g., 'Senior Python Developer in New York')", "editor": "textarea", "default": "I'm looking for remote senior software engineering roles in AI companies. I have 5 years of experience with Python and machine learning.", "nullable": true }, "llm_settings": { "title": "LLM Provider Settings", "type": "object", "description": "Configure which LLM provider to use", "editor": "json", "default": { "provider": "gemini", "model": "gemini-1.5-pro" }, "prefill": { "provider": "gemini", "model": "gemini-1.5-pro" } }, "api_keys": { "title": "API Keys", "type": "object", "description": "API keys for LLM providers (optional - defaults to environment variables)", "editor": "json", "default": {}, "prefill": { "openai": "", "claude": "", "gemini": "" } }, "linkedin_search_params": { "title": "Additional LinkedIn Search Parameters", "type": "object", "description": "Override specific LinkedIn search parameters", "editor": "json", "nullable": true }, "proxy": { "title": "Proxy Configuration", "type": "object", "description": "Configure Apify proxy for LinkedIn scraping", "editor": "proxy", "default": { "useApifyProxy": true, "apifyProxyGroups": ["RESIDENTIAL"] } } }, "required": []}
src/cv_processor.py
import logging
import base64
import json
import re
from typing import Dict, Any, Optional

logger = logging.getLogger(__name__)

async def process_cv(cv_data: str, llm_provider, provider_name: str) -> Dict[str, Any]:
    """
    Process CV data using the appropriate LLM provider

    Args:
        cv_data: CV data (either base64 encoded file or plain text)
        llm_provider: The LLM provider instance to use
        provider_name: Name of the provider ('openai', 'claude', or 'gemini')

    Returns:
        Dictionary of extracted parameters for LinkedIn job search
    """
    try:
        logger.info(f"Processing CV with {provider_name} provider")

        # Process CV with the provider
        cv_parameters = await llm_provider.process_cv(cv_data)

        # Validate and clean the parameters
        cv_parameters = validate_cv_parameters(cv_parameters)

        logger.info(f"Successfully extracted parameters from CV: {json.dumps(cv_parameters, indent=2)}")
        return cv_parameters

    except Exception as e:
        logger.error(f"Error processing CV: {str(e)}")
        # Return empty parameters, which will use defaults later
        return {}

def validate_cv_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean the parameters extracted from the CV

    Args:
        parameters: Raw parameters extracted by the LLM

    Returns:
        Cleaned and validated parameters
    """
    cleaned = {}

    # Clean and validate title
    if "title" in parameters and parameters["title"]:
        cleaned["title"] = str(parameters["title"]).strip()

    # Clean and validate location
    if "location" in parameters and parameters["location"]:
        cleaned["location"] = str(parameters["location"]).strip()

    # Clean and validate experienceLevel
    if "experienceLevel" in parameters and parameters["experienceLevel"]:
        exp_level = str(parameters["experienceLevel"]).strip()
        # Ensure it's a number from 1-5
        if exp_level in ["1", "2", "3", "4", "5"]:
            cleaned["experienceLevel"] = exp_level

    # Clean and validate workType
    if "workType" in parameters and parameters["workType"]:
        work_type = str(parameters["workType"]).strip()
        # Ensure it's a valid work type (1, 2, or 3)
        if work_type in ["1", "2", "3"]:
            cleaned["workType"] = work_type

    # Clean and validate contractType
    if "contractType" in parameters and parameters["contractType"]:
        contract_type = str(parameters["contractType"]).strip().upper()
        # Ensure it's a valid contract type (F, P, C, T, I, or V)
        if contract_type in ["F", "P", "C", "T", "I", "V"]:
            cleaned["contractType"] = contract_type

    # Clean and validate skills (might be used for custom filtering later)
    if "skills" in parameters and isinstance(parameters["skills"], list):
        cleaned["skills"] = [str(skill).strip() for skill in parameters["skills"] if skill]

    return cleaned
src/main.py
#!/usr/bin/env python3
from apify import Actor
import logging
import json
import base64
import re
import os
from typing import Dict, List, Any, Optional

# Import providers
from .llm_providers.factory import create_llm_provider
from .cv_processor import process_cv
from .prompt_processor import process_prompt
from .parameter_handler import apply_parameter_defaults

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

async def main():
    """Main entry point for the Actor"""
    # Initialize the Actor
    await Actor.init()

    # Get input from the actor
    actor_input = await Actor.get_input() or {}

    # Validate input - require at least CV or prompt
    cv_data = actor_input.get("cv")
    prompt = actor_input.get("prompt")

    if not cv_data and not prompt:
        raise ValueError("At least one of CV or prompt must be provided")

    # Get LLM settings
    llm_settings = actor_input.get("llm_settings", {"provider": "gemini", "model": "gemini-1.5-pro"})
    provider_name = llm_settings.get("provider", "gemini")

    # Get API key - first from input, then from environment variables
    api_keys = actor_input.get("api_keys", {})
    api_key = api_keys.get(provider_name)

    # If no API key in input, try to get from environment variables
    if not api_key:
        if provider_name == "openai":
            api_key = os.getenv("OPENAI_API_KEY")
        elif provider_name == "gemini":
            api_key = os.getenv("GEMINI_API_KEY")
        elif provider_name == "claude":
            api_key = os.getenv("CLAUDE_API_KEY")

    # If no API key was found, we can't proceed with LLM processing
    if not api_key:
        logger.warning(f"No API key provided for {provider_name}")
        await Actor.push_data([{
            "title": "LLM API KEY IS NEEDED",
            "description": f"Please provide an API key for {provider_name.upper()} to use this Actor",
            "instructions": f"Set the {provider_name.upper()}_API_KEY environment variable or provide it in the api_keys input parameter",
            "location": "N/A",
            "companyName": "AI Job Finder",
            "experienceLevel": "N/A",
            "workType": "N/A",
            "contractType": "N/A",
            "publishedAt": "N/A",
            "message": f"API key for {provider_name} is required to get real results"
        }])
        logger.info("Returned message indicating API key is needed")
        return

    # Create LLM provider for processing
    model = llm_settings.get("model")
    if provider_name == "gemini" and not model:
        model = "gemini-1.5-pro"

    logger.info(f"Using LLM provider: {provider_name} with model: {model}")
    llm_provider = create_llm_provider(provider_name, api_key, model)

    # Process parameters
    parameters = {}

    # Extract parameters from CV and/or prompt
    if cv_data:
        logger.info("Processing CV...")
        cv_parameters = await process_cv(cv_data, llm_provider, provider_name)
        parameters.update(cv_parameters)

    if prompt:
        logger.info("Processing prompt...")
        try:
            prompt_parameters = await process_prompt(prompt, llm_provider)
            # Prompt parameters override CV parameters
            parameters.update(prompt_parameters)
        except Exception as e:
            logger.error(f"Error processing prompt: {str(e)}")
            # Continue with default parameters

    # Apply any explicit parameters from input
    linkedin_params = actor_input.get("linkedin_search_params", {})
    if linkedin_params:
        parameters.update(linkedin_params)

    # Apply defaults for missing parameters
    parameters = apply_parameter_defaults(parameters)

    # Set proxy configuration
    if "proxy_configuration" in actor_input:
        parameters["proxy"] = actor_input["proxy_configuration"]
    elif "proxy" in actor_input:
        parameters["proxy"] = actor_input["proxy"]

    # Log the parameters we'll use
    logger.info(f"Using LinkedIn search parameters: {json.dumps(parameters, indent=2)}")

    # Call LinkedIn scraper
    logger.info("Calling LinkedIn scraper with parameters")
    try:
        jobs = await call_linkedin_scraper(parameters)

        # Save output
        await Actor.push_data(jobs)
        logger.info(f"Found {len(jobs)} matching jobs")
    except Exception as e:
        logger.error(f"Error calling LinkedIn scraper: {str(e)}")
        # Return a meaningful error to the user
        await Actor.push_data([{
            "title": "Error Connecting to LinkedIn Scraper",
            "description": f"An error occurred while trying to connect to the LinkedIn Jobs Scraper: {str(e)}",
            "error": True,
            "parameters": parameters
        }])

async def call_linkedin_scraper(parameters):
    """Call the LinkedIn scraper with the given parameters"""
    # Prepare the Actor input
    run_input = {
        "title": parameters.get("title", ""),
        "location": parameters.get("location", ""),
        "companyName": parameters.get("companyName", []),
        "companyId": parameters.get("companyId", []),
        "workType": parameters.get("workType", ""),
        "experienceLevel": parameters.get("experienceLevel", ""),
        "contractType": parameters.get("contractType", ""),
        "publishedAt": parameters.get("publishedAt", ""),
        "rows": parameters.get("rows", 10),
        "proxy": parameters.get("proxy", {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"]
        })
    }

    # Run the Actor and wait for it to finish using Actor.apify_client
    # This automatically handles the authentication - no need for explicit API key
    run = await Actor.apify_client.actor("BHzefUZlZRKWxkTck").call(run_input=run_input)

    # Fetch and return the Actor's output
    dataset_items = await Actor.apify_client.dataset(run["defaultDatasetId"]).list_items()
    return dataset_items.items
src/parameter_handler.py
import logging
from typing import Dict, Any

logger = logging.getLogger(__name__)

def apply_parameter_defaults(parameters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply default values for missing parameters

    Args:
        parameters: Current set of parameters

    Returns:
        Parameters with defaults applied
    """
    # Create a copy of the parameters to avoid modifying the original
    final_params = parameters.copy()

    # Check for title (required parameter)
    if "title" not in final_params or not final_params["title"]:
        final_params["title"] = "Software Engineer"  # Default job title
        logger.info("Using default job title: 'Software Engineer'")

    # Set default location if not provided
    if "location" not in final_params or not final_params["location"]:
        final_params["location"] = "United States"  # Country is required, default to United States
        logger.info("Using default location: United States")

    # Set default experience level if not provided
    if "experienceLevel" not in final_params or not final_params["experienceLevel"]:
        final_params["experienceLevel"] = "3"  # Associate
        logger.info("Using default experience level: 3 (Associate)")

    # Set default work type if not provided
    if "workType" not in final_params or not final_params["workType"]:
        final_params["workType"] = ""  # Empty string means any work type
        logger.info("Using default work type: any")

    # Set default contract type if not provided
    if "contractType" not in final_params or not final_params["contractType"]:
        final_params["contractType"] = "F"  # Full-time
        logger.info("Using default contract type: F (Full-Time)")

    # Set default published at if not provided
    if "publishedAt" not in final_params or not final_params["publishedAt"]:
        final_params["publishedAt"] = ""  # Empty string means any time
        logger.info("Using default time frame: any time")

    # Set default company name if not provided
    if "companyName" not in final_params or not final_params["companyName"]:
        final_params["companyName"] = []  # Empty list means any company
        logger.info("Using default company name: any company")

    # Set default company ID if not provided
    if "companyId" not in final_params or not final_params["companyId"]:
        final_params["companyId"] = []  # Empty list means any company ID
        logger.info("Using default company ID: any company ID")

    # Set default rows if not provided
    if "rows" not in final_params or not final_params["rows"]:
        final_params["rows"] = 10  # Default to 10 results
        logger.info("Using default rows: 10")

    # Ensure we have proper proxy configuration
    if "proxy" not in final_params or not final_params["proxy"]:
        final_params["proxy"] = {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"]
        }
        logger.info("Using default proxy configuration")

    return final_params
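As an illustration of the defaulting behaviour above, a minimal usage sketch; the printed values follow directly from the defaults defined in this module.

# Illustrative only: shows which defaults fill in around a partial parameter set.
from src.parameter_handler import apply_parameter_defaults

partial = {"title": "Data Engineer", "workType": "2"}  # remote Data Engineer
full = apply_parameter_defaults(partial)

# full now also contains, among others:
#   location="United States", experienceLevel="3", contractType="F",
#   publishedAt="", companyName=[], companyId=[], rows=10,
#   proxy={"useApifyProxy": True, "apifyProxyGroups": ["RESIDENTIAL"]}
print(full["location"], full["rows"])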
src/prompt_processor.py
import logging
import json
from typing import Dict, Any, Optional

logger = logging.getLogger(__name__)

async def process_prompt(prompt: str, llm_provider) -> Dict[str, Any]:
    """
    Process user prompt and extract job search parameters

    Args:
        prompt: User's job search query
        llm_provider: The LLM provider instance to use

    Returns:
        Dictionary of extracted parameters for LinkedIn job search
    """
    try:
        logger.info("Processing user prompt")

        # Process prompt with the provider
        prompt_parameters = await llm_provider.process_prompt(prompt)

        # Validate and clean the parameters
        prompt_parameters = validate_prompt_parameters(prompt_parameters)

        logger.info(f"Successfully extracted parameters from prompt: {json.dumps(prompt_parameters, indent=2)}")
        return prompt_parameters

    except Exception as e:
        logger.error(f"Error processing prompt: {str(e)}")
        # Return empty parameters, which will use defaults later
        return {}

def validate_prompt_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean the parameters extracted from the prompt

    Args:
        parameters: Raw parameters extracted by the LLM

    Returns:
        Cleaned and validated parameters
    """
    cleaned = {}

    # Clean and validate title
    if "title" in parameters and parameters["title"]:
        cleaned["title"] = str(parameters["title"]).strip()

    # Clean and validate location
    if "location" in parameters and parameters["location"]:
        cleaned["location"] = str(parameters["location"]).strip()

    # Clean and validate experienceLevel
    if "experienceLevel" in parameters and parameters["experienceLevel"]:
        exp_level = str(parameters["experienceLevel"]).strip()
        # Ensure it's a number from 1-5
        if exp_level in ["1", "2", "3", "4", "5"]:
            cleaned["experienceLevel"] = exp_level

    # Clean and validate workType
    if "workType" in parameters and parameters["workType"]:
        work_type = str(parameters["workType"]).strip()
        # Ensure it's a valid work type (1, 2, or 3)
        if work_type in ["1", "2", "3"]:
            cleaned["workType"] = work_type

    # Clean and validate contractType
    if "contractType" in parameters and parameters["contractType"]:
        contract_type = str(parameters["contractType"]).strip().upper()
        # Ensure it's a valid contract type (F, P, C, T, I, or V)
        if contract_type in ["F", "P", "C", "T", "I", "V"]:
            cleaned["contractType"] = contract_type

    # Clean and validate publishedAt
    if "publishedAt" in parameters and parameters["publishedAt"]:
        published_at = str(parameters["publishedAt"]).strip()
        # Ensure it's a valid time frame
        if published_at in ["r86400", "r604800", "r2592000", ""]:
            cleaned["publishedAt"] = published_at

    # Clean and validate rows
    if "rows" in parameters and parameters["rows"]:
        try:
            rows = int(parameters["rows"])
            if rows > 0:
                cleaned["rows"] = rows
        except (ValueError, TypeError):
            pass

    # Clean and validate companyName
    if "companyName" in parameters and isinstance(parameters["companyName"], list):
        cleaned["companyName"] = [str(company).strip() for company in parameters["companyName"] if company]

    # Clean and validate companyId
    if "companyId" in parameters and isinstance(parameters["companyId"], list):
        cleaned["companyId"] = [str(company_id).strip() for company_id in parameters["companyId"] if company_id]

    return cleaned
src/__init__.py
# AI Job Finder package
src/__main__.py
import asyncio

from .main import main

# Execute the Actor entrypoint
asyncio.run(main())
example/advanced-reddit-scraper/.dockerignore
.git
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
example/advanced-reddit-scraper/.gitignore
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Added by Apify CLI
node_modules
example/advanced-reddit-scraper/README.md
# 🚀 Advanced Reddit Scraper for Apify Actors - Lightning Fast & SEO Optimized

Unlock the full potential of Reddit data with our Advanced Reddit Scraper designed for the Apify platform. This high-performance tool uses lightning-fast requests to extract extensive subreddit information, providing researchers, marketers, and data enthusiasts with unparalleled social media insights.

## 📊 Comprehensive Reddit Data Extraction

Our Reddit Scraper offers a robust set of features that allow you to collect detailed data from any subreddit using Apify's powerful actor architecture. Enjoy rapid scraping with optimal performance while taking advantage of customizable settings tailored to your data requirements.

### 🔥 Key SEO-Optimized Features

- **Full Subreddit Scraping**: Extract every detail from the target subreddits, capturing posts, comments, and metadata.
- **Customizable Data Fields**: Configure exactly what you're after, ensuring that you only get the data that matters.
- **Lightning Fast Performance**: Utilizes Python requests for rapid data retrieval, so you never miss a trending topic.
- **Scalable Data Collection**: Effortlessly scrape multiple subreddits simultaneously, ideal for large-scale data mining.
- **Real-Time Insights**: Obtain the most current Reddit information, perfect for real-time analytics and trend monitoring.
- **Easy Integration with Data Pipelines**: Seamlessly export data in various formats (JSON, CSV, etc.) for immediate analysis.

### 🌟 Use Cases for Maximum Impact

1. **Market Research & Trend Analysis**: Monitor public opinion and identify trending topics across subreddits.
2. **Content Creation & Optimization**: Discover viral posts and themes to inspire your content strategy.
3. **Sentiment Analysis**: Analyze user reactions and sentiments using detailed comment extraction.
4. **Competitive Intelligence**: Stay ahead by tracking competitor mentions and industry-specific discussions.
5. **Academic & Social Media Research**: Gather comprehensive data for scholarly studies and social behavior analysis.

### 🛠️ How It Works

1. **Input Parameters** (see the example run input after this list):
   - **Queries**: Provide one or more subreddit URLs in the format `https://reddit.com/r/<subreddit>`.
   - **Post Sorting**: Choose how posts are sorted (e.g., `hot`, `new`, `top`, or `rising`).
   - **Top Period**: Specify the period for top posts (e.g., `day`, `week`, or `all`).
   - **Max Posts**: Set the maximum number of posts to scrape per subreddit.
   - **Comment Sorting**: Define the method to sort comments (e.g., `best`, `top`, `new`).
   - **Number of Comments**: Determine how many comments (and a few nested replies) to extract per post.
2. **Execution**: Our scraper efficiently navigates Reddit using HTTP requests, ensuring quick and reliable data extraction while strictly following Reddit's guidelines.
3. **Output**: Receive clean, structured data ready for analysis and integration into your existing workflows.
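A minimal run-input sketch assembled from the parameters above. The `<username>` placeholder, the `<APIFY_TOKEN>` value, and the use of `apify-client` are assumptions for illustration, not part of this repository; the field names follow `.actor/input_schema.json`.

```python
# Illustrative sketch: run the scraper with one subreddit and default-like settings.
from apify_client import ApifyClient

client = ApifyClient("<APIFY_TOKEN>")  # placeholder token

run_input = {
    "queries": ["https://reddit.com/r/AskReddit"],
    "postSort": "top",        # hot | new | top | rising
    "topPeriod": "week",      # only used when postSort is "top"
    "limit": 10,              # max posts per subreddit
    "commentSort": "top",     # best | top | new | controversial | old | qa
    "numComments": 0,         # comments (plus a few replies) per post
}

# "<username>/reddit-subreddit-scraper" is a placeholder for the published Actor ID.
run = client.actor("<username>/reddit-subreddit-scraper").call(run_input=run_input)
print(run["defaultDatasetId"])
```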
### 📈 Why Our Reddit Scraper Stands Out

- **Comprehensive Data Collection**: Capture every available piece of information from the subreddits you track.
- **High-Speed Requests**: Leveraging the fastest possible scraping techniques to give you immediate insights.
- **Customizable & Flexible**: Tailor the scraping process to meet diverse and specific data needs.
- **Enterprise-Grade Scalability**: Perfect for both small-scale projects and large-scale data operations.
- **Ethical & Compliant**: Adheres to Reddit’s data usage policies, including respecting robots.txt and API guidelines.

### 🔗 Essential Resources

- [Apify Platform](https://apify.com)
- [Actor Documentation](https://docs.apify.com/actors)
- [API Reference](https://docs.apify.com/api/v2)

### 📞 Expert Support When You Need It

For further assistance or inquiries, feel free to reach out:
- 📧 Email: tnot2652@gmail.com

### 🚀 Ready to Upgrade Your Data Game?

Don't miss out on vital Reddit insights. Enhance your data strategy and make informed decisions with our Advanced Reddit Scraper. Start scraping smarter and faster on Apify today!
example/advanced-reddit-scraper/requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.2.1
beautifulsoup4[lxml]
httpx
types-beautifulsoup4
src/llm_providers/base_provider.py
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional

class LLMProvider(ABC):
    """Base abstract class for LLM providers"""

    def __init__(self, api_key: str, model: Optional[str] = None):
        """
        Initialize the LLM provider

        Args:
            api_key: API key for the provider
            model: Optional specific model to use
        """
        self.api_key = api_key
        self.model = model

    @abstractmethod
    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        """
        Process CV data and extract job search parameters

        Args:
            cv_data: CV content (could be base64 encoded file or text)

        Returns:
            Dictionary of extracted parameters
        """
        pass

    @abstractmethod
    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        """
        Process user prompt and extract job search parameters

        Args:
            prompt: User's job search query

        Returns:
            Dictionary of extracted parameters
        """
        pass

    @abstractmethod
    async def validate_api_key(self) -> bool:
        """
        Validate that the API key is correct

        Returns:
            True if valid, False otherwise
        """
        pass

    def supports_document_processing(self) -> bool:
        """
        Check if the provider and model support direct document processing

        Returns:
            True if document processing is supported, False otherwise
        """
        return False
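To illustrate how a concrete provider plugs into this abstract base, a minimal sketch of a hypothetical extra provider. The EchoProvider class and its trivial logic are illustrative only and are not part of the repository; a real provider would call an actual LLM API in each method.

# Hypothetical example: a stub that satisfies the LLMProvider interface.
from typing import Dict, Any, Optional

from src.llm_providers.base_provider import LLMProvider

class EchoProvider(LLMProvider):
    """Toy provider showing which methods a new provider must implement."""

    def __init__(self, api_key: str, model: Optional[str] = None):
        super().__init__(api_key, model)

    async def validate_api_key(self) -> bool:
        # A real provider would make a cheap authenticated call here.
        return bool(self.api_key)

    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        # A real provider would prompt an LLM; returning {} lets downstream
        # code fill in values via apply_parameter_defaults().
        return {}

    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        return {"title": prompt[:50]}  # naive placeholder extraction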
src/llm_providers/claude_provider.py
import json
import logging
import re
import base64
from typing import Dict, Any, Optional, List

from anthropic import AsyncAnthropic
from src.llm_providers.base_provider import LLMProvider

logger = logging.getLogger(__name__)

class ClaudeProvider(LLMProvider):
    """Implementation of LLM provider for Anthropic Claude"""

    def __init__(self, api_key: str, model: Optional[str] = None):
        """Initialize the Claude provider"""
        super().__init__(api_key, model)
        self.client = AsyncAnthropic(api_key=api_key)
        self.model = model or "claude-3-opus-20240229"  # Default to most capable model

    def supports_document_processing(self) -> bool:
        """Check if this provider/model supports direct document processing"""
        return "claude-3" in self.model  # All Claude 3 models support document processing

    async def validate_api_key(self) -> bool:
        """Validate the API key by making a simple models call"""
        try:
            # There's no direct way to validate the key without making a message request
            # Use a minimal request to check if the key works
            await self.client.messages.create(
                model=self.model,
                max_tokens=10,
                messages=[{"role": "user", "content": "Hello"}]
            )
            return True
        except Exception as e:
            logger.error(f"Claude API key validation failed: {str(e)}")
            return False

    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        """
        Process CV with Claude

        Args:
            cv_data: CV content (could be base64 encoded file or text)

        Returns:
            Dictionary of extracted parameters
        """
        if self.supports_document_processing() and cv_data.startswith("data:"):
            return await self._process_cv_with_document_api(cv_data)
        else:
            # Assume it's already text
            return await self._process_cv_text(cv_data)

    async def _process_cv_with_document_api(self, cv_data: str) -> Dict[str, Any]:
        """Process CV using Claude's document capabilities"""
        try:
            # Extract the mime type and base64 data
            mime_type, encoded_data = cv_data.split(';base64,', 1)
            mime_type = mime_type.replace('data:', '')

            response = await self.client.messages.create(
                model=self.model,
                max_tokens=4000,
                system="Extract job search parameters from this CV/resume.",
                messages=[
                    {"role": "user", "content": [
                        {"type": "text", "text": self._get_cv_prompt()},
                        {"type": "image", "source": {
                            "type": "base64",
                            "media_type": mime_type,
                            "data": encoded_data
                        }}
                    ]}
                ]
            )

            # Extract JSON from response
            content = response.content[0].text
            # Find JSON in the content (handle potential text wrapping)
            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(1))

            # If no JSON block, try to parse the entire content
            return json.loads(content)
        except Exception as e:
            logger.error(f"Claude document processing failed: {str(e)}")
            raise

    async def _process_cv_text(self, cv_text: str) -> Dict[str, Any]:
        """Process CV text with Claude"""
        try:
            response = await self.client.messages.create(
                model=self.model,
                max_tokens=4000,
                system="Extract job search parameters from this CV/resume.",
                messages=[
                    {"role": "user", "content": self._get_cv_prompt() + f"\n\nCV TEXT:\n{cv_text}"}
                ]
            )

            # Extract JSON from response
            content = response.content[0].text
            # Find JSON in the content (handle potential text wrapping)
            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(1))

            # If no JSON block, try to parse the entire content
            return json.loads(content)
        except Exception as e:
            logger.error(f"Claude text processing failed: {str(e)}")
            raise

    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        """Process user prompt and extract job search parameters"""
        try:
            response = await self.client.messages.create(
                model=self.model,
                max_tokens=4000,
                system="Extract job search parameters from this query.",
                messages=[
                    {"role": "user", "content": self._get_prompt_extraction_prompt() + f"\n\nUSER QUERY:\n{prompt}"}
                ]
            )

            # Extract JSON from response
            content = response.content[0].text
            # Find JSON in the content (handle potential text wrapping)
            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(1))

            # If no JSON block, try to parse the entire content
            return json.loads(content)
        except Exception as e:
            logger.error(f"Claude prompt processing failed: {str(e)}")
            raise

    def _get_cv_prompt(self) -> str:
        """Get the prompt for CV analysis"""
        return """
        Extract the following job search parameters from this CV/resume.

        Return your response as valid JSON object inside ```json code blocks with the following structure:

        ```json
        {
            "title": "The most recent job title or professional role",
            "location": "Current or preferred location",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid
                Based on any workstyle preferences found in the CV",
            "contractType": "A single letter representing employment type preference:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "skills": ["list", "of", "key", "technical", "and", "soft", "skills"]
        }
        ```

        If a piece of information is not clearly stated in the CV, make a reasonable inference based on the available information. If inference is not possible, use null.

        IMPORTANT: Your output must be a valid JSON object wrapped in ```json code blocks.
        """

    def _get_prompt_extraction_prompt(self) -> str:
        """Get the prompt for extracting parameters from user query"""
        return """
        Extract LinkedIn job search parameters from this query.

        Return your response as valid JSON object inside ```json code blocks with the following structure:

        ```json
        {
            "title": "Job title or role to search for",
            "location": "Geographic location for job search",
            "companyName": ["array of specific companies mentioned"],
            "companyId": ["array of LinkedIn company IDs if mentioned"],
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "contractType": "A single letter representing employment type:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "publishedAt": "Time frame:
                r86400 = Last 24 hours
                r604800 = Last week
                r2592000 = Last month
                empty string = Any time",
            "rows": "Number of job listings to return (integer)"
        }
        ```

        For any parameters not explicitly mentioned in the query, use null.

        IMPORTANT: Your output must be a valid JSON object wrapped in ```json code blocks.
        """
src/llm_providers/factory.py
import logging
from typing import Optional, Any

logger = logging.getLogger(__name__)

def create_llm_provider(provider_name: str, api_key: str, model: Optional[str] = None) -> Any:
    """
    Create and return an instance of the specified LLM provider.

    Args:
        provider_name: Name of the LLM provider ('openai', 'claude', or 'gemini')
        api_key: API key for the provider
        model: Optional specific model to use

    Returns:
        An instance of the appropriate LLM provider

    Raises:
        ValueError: If the provider is not supported
    """
    if provider_name.lower() == "openai":
        from src.llm_providers.openai_provider import OpenAIProvider
        return OpenAIProvider(api_key, model)
    elif provider_name.lower() == "claude":
        from src.llm_providers.claude_provider import ClaudeProvider
        return ClaudeProvider(api_key, model)
    elif provider_name.lower() == "gemini":
        from src.llm_providers.gemini_provider import GeminiProvider
        return GeminiProvider(api_key, model)
    else:
        raise ValueError(f"Unsupported LLM provider: {provider_name}")
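A short usage sketch of the factory; the "sk-..." key is a placeholder, and the model choice is just one of the values the providers accept.

# Illustrative only: create a provider and extract parameters from a prompt.
import asyncio
from src.llm_providers.factory import create_llm_provider

async def demo():
    provider = create_llm_provider("openai", "sk-...", model="gpt-4o")  # placeholder key
    if await provider.validate_api_key():
        params = await provider.process_prompt("Remote data engineer roles in Berlin")
        print(params)

asyncio.run(demo())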
src/llm_providers/gemini_provider.py
import json
import logging
import re
import base64
from typing import Dict, Any, Optional, List

import google.generativeai as genai
from src.llm_providers.base_provider import LLMProvider

logger = logging.getLogger(__name__)

class GeminiProvider(LLMProvider):
    """Implementation of LLM provider for Google Gemini"""

    def __init__(self, api_key: str, model: Optional[str] = None):
        """Initialize the Gemini provider"""
        super().__init__(api_key, model)
        genai.configure(api_key=api_key)
        self.model_name = model or "gemini-1.5-pro"
        self.model = genai.GenerativeModel(self.model_name)

    def supports_document_processing(self) -> bool:
        """Check if this provider/model supports direct document processing"""
        vision_capable_models = ["gemini-pro-vision", "gemini-1.5-pro", "gemini-1.5-flash"]
        return any(model_name in self.model_name for model_name in vision_capable_models)

    async def validate_api_key(self) -> bool:
        """Validate the API key by making a simple models call"""
        try:
            # Gemini doesn't have a dedicated validate endpoint, use a simple generation
            response = self.model.generate_content("Hello")
            return True
        except Exception as e:
            logger.error(f"Gemini API key validation failed: {str(e)}")
            return False

    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        """
        Process CV with Gemini

        Args:
            cv_data: CV content (could be base64 encoded file or text)

        Returns:
            Dictionary of extracted parameters
        """
        if self.supports_document_processing() and cv_data.startswith("data:"):
            return await self._process_cv_with_vision(cv_data)
        else:
            # Assume it's already text
            return await self._process_cv_text(cv_data)

    async def _process_cv_with_vision(self, cv_data: str) -> Dict[str, Any]:
        """Process CV using Gemini's vision capabilities"""
        try:
            # Extract the mime type and base64 data
            mime_type, encoded_data = cv_data.split(';base64,', 1)
            mime_type = mime_type.replace('data:', '')

            # Create a content parts list with prompt and image
            parts = [
                self._get_cv_prompt(),
                {"mime_type": mime_type, "data": base64.b64decode(encoded_data)}
            ]

            response = self.model.generate_content(
                parts,
                generation_config={
                    "temperature": 0.1
                }
            )

            # Extract JSON from response
            content = response.text

            # Try to parse as JSON directly
            try:
                return json.loads(content)
            except json.JSONDecodeError:
                # If direct parsing fails, look for JSON in code blocks
                json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(1))

                # If still no match, try to find anything that looks like JSON
                json_pattern = r'{.*}'
                json_match = re.search(json_pattern, content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(0))

                logger.error(f"Could not parse Gemini response as JSON: {content}")
                raise ValueError("Failed to parse Gemini response as JSON")

        except Exception as e:
            logger.error(f"Gemini vision processing failed: {str(e)}")
            raise

    async def _process_cv_text(self, cv_text: str) -> Dict[str, Any]:
        """Process CV text with Gemini"""
        try:
            response = self.model.generate_content(
                self._get_cv_prompt() + f"\n\nCV TEXT:\n{cv_text}",
                generation_config={
                    "temperature": 0.1
                }
            )

            # Extract JSON from response
            content = response.text

            # Try to parse as JSON directly
            try:
                return json.loads(content)
            except json.JSONDecodeError:
                # If direct parsing fails, look for JSON in code blocks
                json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(1))

                # If still no match, try to find anything that looks like JSON
                json_pattern = r'{.*}'
                json_match = re.search(json_pattern, content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(0))

                logger.error(f"Could not parse Gemini response as JSON: {content}")
                raise ValueError("Failed to parse Gemini response as JSON")

        except Exception as e:
            logger.error(f"Gemini text processing failed: {str(e)}")
            raise

    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        """Process user prompt and extract job search parameters"""
        try:
            response = self.model.generate_content(
                self._get_prompt_extraction_prompt() + f"\n\nUSER QUERY:\n{prompt}",
                generation_config={
                    "temperature": 0.1
                }
            )

            # Extract JSON from response
            content = response.text

            # Try to parse as JSON directly
            try:
                return json.loads(content)
            except json.JSONDecodeError:
                # If direct parsing fails, look for JSON in code blocks
                json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(1))

                # If still no match, try to find anything that looks like JSON
                json_pattern = r'{.*}'
                json_match = re.search(json_pattern, content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(0))

                logger.error(f"Could not parse Gemini response as JSON: {content}")
                raise ValueError("Failed to parse Gemini response as JSON")

        except Exception as e:
            logger.error(f"Gemini prompt processing failed: {str(e)}")
            raise

    def _get_cv_prompt(self) -> str:
        """Get the prompt for CV analysis"""
        return """
        Extract the following job search parameters from this CV/resume:

        Follow these steps:
        1. Identify the job title
        2. Determine the location
        3. Assess experience level (1-5)
        4. Identify work type preference (1-3)
        5. Determine contract type (FPCTIV)
        6. List key skills

        Return ONLY a JSON object with this format:
        {
            "title": "The most recent job title or professional role",
            "location": "Current or preferred location",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid
                Based on any workstyle preferences found in the CV",
            "contractType": "A single letter representing employment type preference:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "skills": ["list", "of", "key", "technical", "and", "soft", "skills"]
        }

        If a piece of information is not clearly stated in the CV, make a reasonable inference based on the available information. If inference is not possible, use null.

        IMPORTANT: Your output must be ONLY the JSON object with no additional text.
        """

    def _get_prompt_extraction_prompt(self) -> str:
        """Get the prompt for extracting parameters from user query"""
        return """
        Extract LinkedIn job search parameters from this query.

        Follow these steps:
        1. Identify job title or role
        2. Determine geographic location
        3. Note any specific companies mentioned
        4. Assess experience level (1-5)
        5. Identify work type (1-3)
        6. Determine contract type (FPCTIV)
        7. Identify time frame for job postings

        Return ONLY a JSON object with this format:
        {
            "title": "Job title or role to search for",
            "location": "Geographic location for job search",
            "companyName": ["array of specific companies mentioned"],
            "companyId": ["array of LinkedIn company IDs if mentioned"],
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "contractType": "A single letter representing employment type:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "publishedAt": "Time frame:
                r86400 = Last 24 hours
                r604800 = Last week
                r2592000 = Last month
                empty string = Any time",
            "rows": "Number of job listings to return (integer)"
        }

        For any parameters not explicitly mentioned in the query, use null.

        IMPORTANT: Your output must be ONLY the JSON object with no additional text.
        """
src/llm_providers/openai_provider.py
import json
import logging
import base64
import re
from typing import Dict, Any, Optional, List

from openai import AsyncOpenAI
from src.llm_providers.base_provider import LLMProvider

logger = logging.getLogger(__name__)

class OpenAIProvider(LLMProvider):
    """Implementation of LLM provider for OpenAI"""

    def __init__(self, api_key: str, model: Optional[str] = None):
        """Initialize the OpenAI provider"""
        super().__init__(api_key, model)
        self.client = AsyncOpenAI(api_key=api_key)
        self.model = model or "gpt-4o"  # Default to most capable model

    def supports_document_processing(self) -> bool:
        """Check if this provider/model supports direct document processing"""
        document_capable_models = ["gpt-4-vision", "gpt-4o"]
        return any(model_name in self.model for model_name in document_capable_models)

    async def validate_api_key(self) -> bool:
        """Validate the API key by making a simple models.list call"""
        try:
            await self.client.models.list()
            return True
        except Exception as e:
            logger.error(f"OpenAI API key validation failed: {str(e)}")
            return False

    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        """
        Process CV with OpenAI

        Args:
            cv_data: CV content (could be base64 encoded file or text)

        Returns:
            Dictionary of extracted parameters
        """
        if self.supports_document_processing() and cv_data.startswith("data:"):
            return await self._process_cv_with_vision(cv_data)
        else:
            # Assume it's already text
            return await self._process_cv_text(cv_data)

    async def _process_cv_with_vision(self, cv_data: str) -> Dict[str, Any]:
        """Process CV using OpenAI's vision capabilities"""
        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "Extract job search parameters from this CV/resume."},
                    {"role": "user", "content": [
                        {"type": "text", "text": self._get_cv_prompt()},
                        {"type": "image_url", "image_url": {"url": cv_data}}
                    ]}
                ],
                response_format={"type": "json_object"}
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            logger.error(f"OpenAI vision processing failed: {str(e)}")
            raise

    async def _process_cv_text(self, cv_text: str) -> Dict[str, Any]:
        """Process CV text with OpenAI"""
        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "Extract job search parameters from this CV/resume."},
                    {"role": "user", "content": self._get_cv_prompt() + f"\n\nCV TEXT:\n{cv_text}"}
                ],
                response_format={"type": "json_object"}
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            logger.error(f"OpenAI text processing failed: {str(e)}")
            raise

    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        """Process user prompt and extract job search parameters"""
        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "Extract job search parameters from this query."},
                    {"role": "user", "content": self._get_prompt_extraction_prompt() + f"\n\nUSER QUERY:\n{prompt}"}
                ],
                response_format={"type": "json_object"}
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            logger.error(f"OpenAI prompt processing failed: {str(e)}")
            raise

    def _get_cv_prompt(self) -> str:
        """Get the prompt for CV analysis"""
        return """
        Extract the following job search parameters from this CV/resume in JSON format:

        Required JSON format:
        {
            "title": "The most recent job title or professional role",
            "location": "Current or preferred location",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid
                Based on any workstyle preferences found in the CV",
            "contractType": "A single letter representing employment type preference:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "skills": ["list", "of", "key", "technical", "and", "soft", "skills"]
        }

        If a piece of information is not clearly stated in the CV, make a reasonable inference based on the available information. If inference is not possible, use null.
        """

    def _get_prompt_extraction_prompt(self) -> str:
        """Get the prompt for extracting parameters from user query"""
        return """
        Extract LinkedIn job search parameters from this query in JSON format:

        Required JSON format:
        {
            "title": "Job title or role to search for",
            "location": "Geographic location for job search",
            "companyName": ["array of specific companies mentioned"],
            "companyId": ["array of LinkedIn company IDs if mentioned"],
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "contractType": "A single letter representing employment type:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "publishedAt": "Time frame:
                r86400 = Last 24 hours
                r604800 = Last week
                r2592000 = Last month
                empty string = Any time",
            "rows": "Number of job listings to return (integer)"
        }

        For any parameters not explicitly mentioned in the query, use null.
        """
src/llm_providers/__init__.py
# LLM Providers package
example/advanced-reddit-scraper/.actor/actor.json
{ "actorSpecification": 1, "name": "reddit-subreddit-scraper", "title": "Reddit Subreddit Scraper", "description": "Scrapes Reddit subreddits.", "version": "0.0", "buildTag": "latest", "meta": { "templateId": "python-beautifulsoup" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
example/advanced-reddit-scraper/.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.12

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
example/advanced-reddit-scraper/.actor/input_schema.json
{ "title": "Advanced Reddit Scraper", "type": "object", "schemaVersion": 1, "properties": { "queries": { "title": "Start URLs or subreddits", "type": "array", "description": "Subreddits to scrape in the format of https://reddit.com/r/<subreddit>", "prefill": [ "https://reddit.com/r/AskReddit" ], "default": [ "https://reddit.com/r/AskReddit" ], "editor": "stringList" }, "postSort": { "title": "Sorting", "type": "string", "enum": [ "hot", "new", "top", "rising" ], "description": "Sorting of posts in the subreddit - (top, new, rising, hot) - If already given a sorted subreddit link (e.g.: https://reddit.com/r/eli5/top/?t=day) the sorting will be ignored", "default": "top" }, "topPeriod": { "title": "Top posts period", "type": "string", "enum": [ "hour", "day", "week", "month", "year", "all" ], "description": "Top posts period - (only works when sorting is top)", "default": "week" }, "limit": { "title": "Max Posts", "type": "integer", "description": "Maximum number of posts to scrape per URL (default: 10)", "default": 10 },
"commentSort": { "title": "Comment sorting", "description": "Sorting of comments in the post - (best, top, new, controversial, old, qa)", "type": "string", "enum": [ "best", "top", "new", "controversial", "old", "qa" ], "default": "top" }, "numComments": { "title": "Number of comments to scrape", "type": "integer", "description": "A few replies to comments is also returned", "default": 0 } }, "required": ["queries"]}
example/advanced-reddit-scraper/.git/COMMIT_EDITMSG
release
example/advanced-reddit-scraper/.git/config
[core]
	repositoryformatversion = 0
	filemode = false
	bare = false
	logallrefupdates = true
	symlinks = false
	ignorecase = true
[remote "origin"]
	url = https://github.com/deduble/advanced-reddit-scraper.git
	fetch = +refs/heads/*:refs/remotes/origin/*
[branch "main"]
	remote = origin
	merge = refs/heads/main
[gui]
	wmstate = normal
	geometry = 1322x693+228+228 254 315
example/advanced-reddit-scraper/.git/description
Unnamed repository; edit this file 'description' to name the repository.
example/advanced-reddit-scraper/.git/FETCH_HEAD
68034258495c18bfb133b925e51bbce4b07c2cf2 branch 'main' of https://github.com/deduble/advanced-reddit-scraper
example/advanced-reddit-scraper/.git/HEAD
ref: refs/heads/main
example/advanced-reddit-scraper/.git/index
Downloadexample/advanced-reddit-scraper/.git/ORIG_HEAD
d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45
example/advanced-reddit-scraper/src/cookies.json
[ { "domain": ".reddit.com", "hostOnly": false, "httpOnly": false, "name": "csrf_token", "path": "/", "sameSite": "strict", "secure": true, "session": true, "storeId": "0", "value": "eb3157bc50b012701ac2f7aab49fcc4c", "id": 1 }, { "domain": ".reddit.com", "expirationDate": 1757533137, "hostOnly": false, "httpOnly": false, "name": "csv", "path": "/", "sameSite": "no_restriction", "secure": true, "session": false, "storeId": "0", "value": "2", "id": 2 }, { "domain": ".reddit.com", "expirationDate": 1757533137, "hostOnly": false, "httpOnly": false, "name": "edgebucket", "path": "/", "sameSite": "unspecified", "secure": true, "session": false, "storeId": "0", "value": "lH2AY01VDhdJZrAOeK", "id": 3 }, { "domain": ".reddit.com", "expirationDate": 1761371475, "hostOnly": false, "httpOnly": false, "name": "loid", "path": "/", "sameSite": "no_restriction", "secure": true, "session": false, "storeId": "0", "value": "0000000000927nml2x.2.1606468634160.Z0FBQUFBQm1zN01jTVgwSUpobndnMVlHS2xfcGNXdk1SbXhpMjJtN0NNa2VCOFZBZ3Zlb3loSGFlZWtxWlNkdHk5cUxZVVZtNDdWQWl6M0xOdXhRc3FsWmVob0pfQXdjQjItZ1pkOHFmTWsxVVFQU194SjEwTi10MHI2ay1TU01EYjhDVjdpclUxVFg", "id": 4 }, { "domain": ".reddit.com", "expirationDate": 1759110222, "hostOnly": false, "httpOnly": false, "name": "pc", "path": "/", "sameSite": "unspecified", "secure": true, "session": false, "storeId": "0", "value": "81", "id": 5 }, { "domain": ".reddit.com", "expirationDate": 1757612826, "hostOnly": false, "httpOnly": true, "name": "reddit_session", "path": "/", "sameSite": "unspecified", "secure": true, "session": false, "storeId": "0", "value": "710093989689%2C2024-08-07T17%3A47%3A05%2C62b4116104fbf3597d47b0718c6986d009b6f8c6", "id": 6 }, { "domain": ".reddit.com", "hostOnly": false, "httpOnly": false, "name": "session_tracker", "path": "/", "sameSite": "no_restriction", "secure": true, "session": true, "storeId": "0", "value": "mdmqaqfjmphfropmga.0.1727922697221.Z0FBQUFBQm1fZ0lKOVhiTHFwazVhYXBQa0FSS2VUTllqd2ljRmhuNFozRHVnZmkxU1JOcmZBd1dteXRPSmJxS0x3S2s0YVE2VEVRaGk1M0JMei1TV1Q2RGN4STZ4aHhCWnJhSEtsRDZsdEZveFVxeUhnVjNrSFNjOFpJRmM0bEREdVZfR2UyYTdZM2U", "id": 7 }, { "domain": ".reddit.com", "expirationDate": 1759458549, "hostOnly": false, "httpOnly": false, "name": "t2_927nml2x_recentclicks3", "path": "/", "sameSite": "strict", "secure": false, "session": false, "storeId": "0", "value": "t3_pgyvok%2Ct3_1fsuzj4%2Ct3_1fk6551%2Ct3_eokkto%2Ct3_14x7ys7%2Ct3_17wo9ms%2Ct3_dpcb2z%2Ct3_16fac9r%2Ct3_analu0%2Ct3_142jsph", "id": 8 }, { "domain": ".reddit.com", "expirationDate": 1728008948.6718, "hostOnly": false, "httpOnly": true, "name": "token_v2", "path": "/", "sameSite": "unspecified", "secure": true, "session": false, "storeId": "0", "value": 
"eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzI4MDA4OTQ4LjE0MDk0MywiaWF0IjoxNzI3OTIyNTQ4LjE0MDk0MiwianRpIjoiNE5wUE5zejMzWkhrWXI0cktxZU9hazJiY0tYMkRRIiwiY2lkIjoiMFItV0FNaHVvby1NeVEiLCJsaWQiOiJ0Ml85MjdubWwyeCIsImFpZCI6InQyXzkyN25tbDJ4IiwibGNhIjoxNjA2NDY4NjM0MTYwLCJzY3AiOiJlSnhra2RHT3REQUloZC1sMXo3Ql95cF9OaHRzY1lhc0xRYW9rM243RFZvY2s3MDdjTDRpSFA4bktJcUZMRTJ1QktHa0tXRUZXdE9VTmlMdjU4eTlPWkVGU3lGVFI4NDN5d29rYVVwUFVtTjVweWxSd1daa0xsZmFzVUtEQjZZcFZTNloyMEtQUzV2UTNJMUZ6MDZNcWx4V0h0VFlvM0pwYkdNSzJ4UGp6Y1pxUXlxdXk2bE1ZRmtvbjhXTGZ2eUctdFktZjdiZmhIWXdyS2dLRF9UT3VGeHdZX0hERkhiX25wcjBiRjJ3cUwzWGc5US0xLU4yN2JObW9kbTVfVnpQdnphU2NUbUc1aWZZdjd0LUNSMTQ1SG1aVVFjd1lnMF95ckFqNl9Ddk9vREtCUVdNSlloUEk1QXJsMl9fSmRpdVRmOGF0eWQtLUdiRVRXXzRyUm1vNXhMRW9VX2o2emNBQVBfX1hEX2U0dyIsInJjaWQiOiJBNjE2cG1hN0taX1R1SzRVOFJlQlJUaXVKV3VBZ3lUY2VRTUpyS01NRk93IiwiZmxvIjoyfQ.GX8N8AYcgK2DWqWPqiclkljcwEawb7GFRw6QMdL9C7lb5FS-_ofuZpR0bx77pgWjWJ9uOczItTUfZvjx9u4CgeS9dK3U8G1apuqUW9YWDrgxfQeFWNMPVd0IjDTEt6Sn8vrdWb5cjv_SsGzxHgtC2RjdDLQYfQu2ud-Qp_1sELlBDPHDfhgOPbuOpzuFz2NJ8ifj623r2a8XOgQi5UaAHEClgleVAdkN2bpMd1kUsYh0PmMZOpN2XqvgdwKJUuyce-9yAqhMLiIPneVJnaytpth0jeRkT5-Fyt-_CgsXYphTG9T9u8Q2Z5JwOrwiosBPEokbhjculNQ78QlUUlC7UA", "id": 9 } ]
example/advanced-reddit-scraper/src/main.py
1"""This module defines the main entry point for the Apify Actor.2
3Feel free to modify this file to suit your specific needs.4
5To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:6https://docs.apify.com/sdk/python7"""8
9import sys10import os11
12from httpx import AsyncClient13
14from apify import Actor, Request15from .redditor import Redditor16
17
18async def main() -> None:19 """Main entry point for the Apify Actor.20
21 This coroutine is executed using `asyncio.run()`, so it must remain an asynchronous function for proper execution.22 Asynchronous execution is required for communication with Apify platform, and it also enhances performance in23 the field of web scraping significantly.24 """25 async with Actor:26 # Retrieve the Actor input, and use default values if not provided.27 actor_input = await Actor.get_input() or {}28 queries = actor_input.get('queries', ['https://reddit.com/r/AskReddit'])29 limit = actor_input.get('limit', 10)30 num_comments = actor_input.get('numComments', 0)31 sorting = actor_input.get('postSort', 'top')32 comment_sort = actor_input.get('commentSort', 'top')33 sorting_period = actor_input.get('topPeriod', 'today')34 if sorting_period not in {'hour', 'day', 'week', 'month', 'year', 'all'}:35 raise ValueError('topPeriod must be one of hour, day, week, month, year, all')36 if sorting not in {'hot', 'new', 'top', 'rising'}:37 raise ValueError('postSort must be one of hot, new, top, rising')38 if comment_sort not in {'best', 'top', 'new','controversial', 'old', 'qa'}:39 raise ValueError('commentSort must be one of hot, new, top, best, top, new,controversial, old, qa')40 reddit_scraper = Redditor(logger=Actor.log)41
42 # Exit if no start URLs are provided.43 if not queries:44 Actor.log.info('No queries specified in Actor input, exiting...')45 await Actor.exit()46
47 # Open the default request queue for handling URLs to be processed.48 request_queue = await Actor.open_request_queue()49
50 # Enqueue the start URLs with an initial crawl depth of 0.51 for query in queries:52 url = reddit_scraper.subreddit_link_from_query(query, sorting=sorting, period=sorting_period)53 Actor.log.info(f'Enqueuing {url} ...')54 request = Request.from_url(url, user_data={'limit': limit, 'numComments': num_comments, 'query': query})55 await request_queue.add_request(request)56
57 # Process the URLs from the request queue.58 while request := await request_queue.fetch_next_request():59 url = request.url60 query = request.user_data['query']61 posts_limit = request.user_data['limit']62 num_comments = request.user_data['numComments']63 Actor.log.info(f'Scraping {request.url} ...')64
65 try:66 # Fetch the HTTP response from the specified URL using HTTPX.67 async with AsyncClient() as client:68 # response = await client.get(url, follow_redirects=True)69 for post in reddit_scraper.get_all_posts(url, posts_limit=posts_limit, comments_limit=num_comments):70 await Actor.push_data(post)71
72 except Exception:73 Actor.log.exception(f'Failed to scrape {url}. Will be tried again.')74 try:75 Actor.log.exception(f"Data failed to be pushed is {post}")76 except:77 pass78 finally:79 # Mark the request as handled to ensure it is not processed again.80 await request_queue.mark_request_as_handled(request)
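Once this example Actor is deployed, it can be called from Python with the apify-client package. The following is a minimal sketch, not taken from the repository; the Actor identifier and token placeholder are illustrative and depend on where you publish it.

from apify_client import ApifyClient

client = ApifyClient("YOUR_APIFY_TOKEN")  # placeholder token

# Start a run and wait for it to finish; the input follows the schema shown earlier.
run_input = {"queries": ["https://reddit.com/r/AskReddit"], "limit": 5}
run = client.actor("your-username/advanced-reddit-scraper").call(run_input=run_input)

# Each scraped post is pushed with Actor.push_data(), so it lands in the default dataset.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["title"], item["permalink"])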
example/advanced-reddit-scraper/src/redditor.py
import base64
import functools
import re
import time
import urllib.parse
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode

import requests
from bs4 import BeautifulSoup

from .session import cookies, headers


def log_execution_time(func):
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        start_time = time.time()
        result = func(self, *args, **kwargs)
        end_time = time.time()
        duration = end_time - start_time
        self.logger.debug(f"{func.__name__} took {duration:.2f} seconds to execute")
        return result
    return wrapper


class Redditor:
    BASE_URL = "https://www.reddit.com"

    def __init__(self, logger):
        self.logger = logger
        self.cookies = cookies
        self.headers = headers
        self.session = requests.Session()
        self.session.cookies.update(cookies)
        self.session.headers.update(headers)

    @log_execution_time
    def get_community_posts(self, url: str, after: Optional[str] = None) -> str:
        try:
            parsed_query = self.parse_url(url)
            sort = parsed_query.get('sort', 'top')
            url = f"{self.BASE_URL}/svc/shreddit/community-more-posts/{sort}/"
            params = {
                "after": after,
                "t": parsed_query['time'] or 'day',
                "name": parsed_query['sub'],
                "navigationSessionId": "a10adc86-f1ec-4221-9179-d9613e4c7d05",
                "feedLength": "28"
            }

            response = self.session.get(url, params=params)
            response.raise_for_status()

            return response.text
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Error fetching community posts: {e}")
            raise
        except Exception as e:
            self.logger.error(f"Unexpected error: {e}")
            raise

    @log_execution_time
    def get_post_content(self, permalink: str) -> str:
        """Get the content of a post using its permalink."""
        try:
            url = f"{self.BASE_URL}{permalink}"
            response = self.session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the post content in the text-body slot
            text_body = soup.find('div', {'slot': 'text-body'})
            if text_body:
                md_div = text_body.find('div', {'class': 'md'})
                if md_div:
                    paragraphs = md_div.find_all('p')
                    return '\n'.join(p.get_text(strip=True) for p in paragraphs)

            # If no text content, check for media content
            shreddit_post = soup.find('shreddit-post')
            if shreddit_post:
                content_href = shreddit_post.get('content-href')
                if content_href:
                    return content_href

            return ''
        except Exception as e:
            self.logger.error(f"Error getting post content: {e}")
            return ''

    @log_execution_time
    def parse_posts(self, html_content: str) -> List[Dict[str, Any]]:
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            posts = []
            for article in soup.find_all('article'):
                shreddit_post = article.find('shreddit-post')
                if shreddit_post:
                    permalink = shreddit_post.get('permalink')
                    post = {
                        "id": shreddit_post.get('id'),
                        "title": shreddit_post.get('post-title'),
                        "author": shreddit_post.get('author'),
                        "subreddit": shreddit_post.get('subreddit-prefixed-name'),
                        "score": shreddit_post.get('score'),
                        "num_comments": shreddit_post.get('comment-count'),
                        "created_timestamp": shreddit_post.get('created-timestamp'),
                        "permalink": permalink,
                        "content": self.get_post_content(permalink)
                    }
                    posts.append(post)
            return posts
        except Exception as e:
            self.logger.error(f"Error parsing posts: {e}")
            raise

    @log_execution_time
    def get_next_cursor(self, html_content: str) -> Optional[str]:
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            load_after = soup.find('faceplate-partial', slot='load-after')

            if load_after:
                src = load_after.get('src', '')
                match = re.search(r'after=([^&]+)', src)
                if match:
                    encoded_cursor = match.group(1)
                    decoded_cursor = urllib.parse.unquote(encoded_cursor)
                    padding = '=' * ((4 - len(decoded_cursor) % 4) % 4)
                    padded_cursor = decoded_cursor + padding
                    return base64.b64decode(padded_cursor).decode('utf-8')
            return None
        except Exception as e:
            self.logger.error(f"Error retrieving next cursor: {e}")
            return None

    @log_execution_time
    def get_all_posts(self, subreddit: str, posts_limit: int = 100, comments_limit: int = 0) -> List[Dict[str, Any]]:
        all_posts = []
        after = None

        try:
            while len(all_posts) < posts_limit:
                self.logger.info(f"Fetching posts for subreddit {subreddit}...")
                html_content = self.get_community_posts(subreddit, after)
                new_posts = self.parse_posts(html_content)[:posts_limit - len(all_posts)]

                if not new_posts:
                    break

                for post in new_posts:
                    if comments_limit > 0:
                        post['comments'] = self.get_all_comments(post['subreddit'].split('/')[1], post['id'], comments_limit)

                all_posts.extend(new_posts)
                after = self.get_next_cursor(html_content)

                if not after:
                    break

            self.logger.info(f"Retrieved {len(all_posts[:posts_limit])} posts.")
            return all_posts[:posts_limit]
        except Exception as e:
            self.logger.error(f"Error retrieving posts: {e}")
            raise

    @log_execution_time
    def parse_url(self, url: str) -> Dict[str, str]:
        result = {'sub': '', 'sort': 'none', 'time': None}

        try:
            subreddit_pattern = re.compile(r'(?:/r/|reddit\.com/r/|^)(\w+)')
            sort_pattern = re.compile(r'/(hot|new|top|rising)')
            time_pattern = re.compile(r'[?&]t=(hour|day|week|month|year|all)')

            if not url.startswith('http'):
                match = subreddit_pattern.search(url)
                if match:
                    result['sub'] = match.group(1)
                return result

            path = urlparse(url).path
            query_string = urlparse(url).query

            sub_match = subreddit_pattern.search(path)
            if sub_match:
                result['sub'] = sub_match.group(1)

            sort_match = sort_pattern.search(path)
            if sort_match:
                result['sort'] = sort_match.group(1)

            time_match = time_pattern.search(query_string)
            if time_match:
                result['time'] = time_match.group(1)

            return result
        except Exception as e:
            self.logger.error(f"Error parsing URL: {e}")
            raise

    @log_execution_time
    def get_comments(self, subreddit: str, post_id: str, cursor: Optional[str] = None, sort: str = 'hot') -> Tuple[List[Dict[str, Any]], Optional[str]]:
        try:
            url = f"{self.BASE_URL}/svc/shreddit/more-comments/{subreddit}/t3_{post_id.split('_')[1]}"
            params = {'sort': sort, 'top-level': '1'}
            data = {}

            if cursor:
                params['cursor'] = cursor

            response = self.session.post(url, params=params, data=data)
            response.raise_for_status()

            return self.parse_comments(response.text)
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Error fetching comments: {e}")
            raise
        except Exception as e:
            self.logger.error(f"Unexpected error: {e}")
            raise

    @log_execution_time
    def parse_comments(self, html_content: str) -> Tuple[List[Dict[str, Any]], Optional[str]]:
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            comments = []

            for comment in soup.find_all('shreddit-comment'):
                content_div = comment.find('div', {'class': 'md'})
                # Extract clean comment text if the content div exists
                if content_div:
                    # Join the content paragraphs with newlines, stripping whitespace
                    paragraphs = content_div.find_all('p')
                    content = '\n'.join(p.get_text(strip=True) for p in paragraphs)
                else:
                    content = ''
                parsed_comment = {
                    "id": comment.get('thingid'),
                    "author": comment.get('author'),
                    "score": comment.get('score'),
                    "depth": comment.get('depth'),
                    "permalink": comment.get('permalink'),
                    "content": content.strip()
                }
                comments.append(parsed_comment)

            next_cursor = self.get_next_comment_cursor(html_content)
            return comments, next_cursor
        except Exception as e:
            self.logger.error(f"Error parsing comments: {e}")
            raise

    @log_execution_time
    def get_next_comment_cursor(self, html_content: str) -> Optional[str]:
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            faceplate_partial = soup.find('faceplate-partial', attrs={'loading': 'action'})

            if faceplate_partial:
                hidden_input = faceplate_partial.find('input', attrs={'type': 'hidden', 'name': 'cursor'})
                if hidden_input:
                    return hidden_input.get('value')
            return None
        except Exception as e:
            self.logger.error(f"Error retrieving next comment cursor: {e}")
            return None

    @log_execution_time
    def get_all_comments(self, subreddit: str, post_id: str, limit: int = 100) -> List[Dict[str, Any]]:
        all_comments = []
        cursor = None

        try:
            while len(all_comments) < limit:
                comments, next_cursor = self.get_comments(subreddit, post_id, cursor)
                all_comments.extend(comments)

                if not next_cursor:
                    self.logger.info(f"Next cursor not found for post {post_id}.")
                    break

                cursor = next_cursor

            self.logger.info(f"Retrieved {len(all_comments)} comments.")
            return all_comments[:limit]
        except Exception as e:
            self.logger.error(f"Error retrieving comments: {e}")
            raise

    @log_execution_time
    def subreddit_link_from_query(self, query, sorting='top', period='week'):
        try:
            # If the input is just a subreddit name (with or without 'r/'),
            # normalize it to a full https://www.reddit.com/r/<subreddit>/ URL.
            if not query.startswith('http'):
                if query.startswith('r/'):
                    query = f'https://www.reddit.com/{query}/'
                else:
                    query = f'https://www.reddit.com/r/{query}/'

            # Parse the subreddit link
            parsed_url = urlparse(query)

            # Split the path into its components
            path_parts = parsed_url.path.rstrip('/').split('/')

            # Valid sorting options
            valid_sorting = ['hot', 'new', 'rising', 'top']

            # Return the original link if it is already sorted
            if len(path_parts) > 3 and path_parts[3] in valid_sorting:
                return query

            # Otherwise, append the sorting method to the path
            path_parts.append(sorting)

            # Add the 't' parameter only if sorting is 'top'
            query_params = parse_qs(parsed_url.query)
            if sorting == 'top':
                query_params['t'] = [period]

            # Rebuild the URL
            new_path = '/'.join(path_parts) + '/'
            new_query = urlencode(query_params, doseq=True)

            return urlunparse((parsed_url.scheme, parsed_url.netloc, new_path, parsed_url.params, new_query, parsed_url.fragment))
        except Exception as e:
            self.logger.error(f"Error constructing subreddit URL from query: {e}")
            raise
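Because Redditor does all of its HTTP work synchronously through requests, it can also be exercised outside the Actor runtime. A minimal local sketch follows, assuming the example repo root is on sys.path so that src imports as a package, and that the cookies and headers bundled in session.py are still accepted by Reddit (stale session cookies may be rejected).

import logging

from src.redditor import Redditor

logging.basicConfig(level=logging.INFO)
scraper = Redditor(logger=logging.getLogger("redditor"))

# Build a sorted subreddit URL the same way main.py does, then pull a few posts.
url = scraper.subreddit_link_from_query("AskReddit", sorting="top", period="week")
for post in scraper.get_all_posts(url, posts_limit=5, comments_limit=0):
    print(post["title"], post["permalink"])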
example/advanced-reddit-scraper/src/session.py
1cookies = {2 'csv': '2',3 'edgebucket': 'lH2AY01VDhdJZrAOeK',4 'loid': '0000000000927nml2x.2.1606468634160.Z0FBQUFBQm1zN01jTVgwSUpobndnMVlHS2xfcGNXdk1SbXhpMjJtN0NNa2VCOFZBZ3Zlb3loSGFlZWtxWlNkdHk5cUxZVVZtNDdWQWl6M0xOdXhRc3FsWmVob0pfQXdjQjItZ1pkOHFmTWsxVVFQU194SjEwTi10MHI2ay1TU01EYjhDVjdpclUxVFg',5 'pc': '81',6 'reddit_session': '710093989689%2C2024-08-07T17%3A47%3A05%2C62b4116104fbf3597d47b0718c6986d009b6f8c6',7 'token_v2': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzI5MTk4MTQ0LjY5NDYyMSwiaWF0IjoxNzI5MTExNzQ0LjY5NDYyMSwianRpIjoiZXBsV3k0R1VURHl4aGwtdEhWZnI2U0lxY00xR0lnIiwiY2lkIjoiMFItV0FNaHVvby1NeVEiLCJsaWQiOiJ0Ml85MjdubWwyeCIsImFpZCI6InQyXzkyN25tbDJ4IiwibGNhIjoxNjA2NDY4NjM0MTYwLCJzY3AiOiJlSnhra2RHT3REQUloZC1sMXo3Ql95cF9OaHRzY1lhc0xRYW9rM243RFZvY2s3MDdjTDRpSFA4bktJcUZMRTJ1QktHa0tXRUZXdE9VTmlMdjU4eTlPWkVGU3lGVFI4NDN5d29rYVVwUFVtTjVweWxSd1daa0xsZmFzVUtEQjZZcFZTNloyMEtQUzV2UTNJMUZ6MDZNcWx4V0h0VFlvM0pwYkdNSzJ4UGp6Y1pxUXlxdXk2bE1ZRmtvbjhXTGZ2eUctdFktZjdiZmhIWXdyS2dLRF9UT3VGeHdZX0hERkhiX25wcjBiRjJ3cUwzWGc5US0xLU4yN2JObW9kbTVfVnpQdnphU2NUbUc1aWZZdjd0LUNSMTQ1SG1aVVFjd1lnMF95ckFqNl9Ddk9vREtCUVdNSlloUEk1QXJsMl9fSmRpdVRmOGF0eWQtLUdiRVRXXzRyUm1vNXhMRW9VX2o2emNBQVBfX1hEX2U0dyIsInJjaWQiOiJBNjE2cG1hN0taX1R1SzRVOFJlQlJUaXVKV3VBZ3lUY2VRTUpyS01NRk93IiwiZmxvIjoyfQ.FBxK7Xnbhy-bW3l71YopqqUBpjfkOdz8XBUauNi3o3pScQLvO0sOs72E2kMiaYX6iTfUPyklR5xRnGVF6PjQmurx2vu8XAm3W1IkGIYPZOOvjnWKhbzv1m8bzfOHGSIZg9bOy7RoCce6A-HCKfR6y4nQyMaiv5jCUdLILePHdUYw3kZEC_ASAXEXvv-dyyaO2GCW_Jxq95CU6lYxqLaO73xhPzR9YjNl_RaAC9xMip6d5Xe3n5wuMdY8bQ3dAfqNVNJKI4fkIij0v90-SJT7vKffNSbueqrckCPgDIXQrpJA1_bx-npHLl5gg7-uBLwDUzXpWMO_BTDxgekscFc6fQ',8 'reddit_chat_view': 'closed',9 't2_927nml2x_recentclicks3': 't3_1g57do3%2Ct3_e7ewoo%2Ct3_1fvuu0l%2Ct3_435p6x%2Ct3_d956ag%2Ct3_15svnqa%2Ct3_f2nxzt%2Ct3_e6ryal%2Ct3_79uq5s%2Ct3_7qry4j',10 'csrf_token': 'd7886d7dde33b8ae9f535d8cf19dad8f',11 'session_tracker': 'mifofnihaddjdlkjml.0.1729129739819.Z0FBQUFBQm5FRzBMWUZrSlZycUctVmcwZ25zZm9ZRTV4T1NMNjdQTW45dTI1eFQ1NDVqTWF2N20yQzlXNVFCUkEyNndKazVCbWJ1ZHFoVlFZMEFPS2xGYXpDY2Fxcm4xX1F6UEZfWFpfal92NTVuRDF6Q0EzTWtOT3lZOENQQUVBaFlScWQwMGpqZFk',12}13
14headers = {15 'accept': 'text/vnd.reddit.partial+html, text/html;q=0.9',16 'accept-language': 'en,en-US;q=0.9,tr-TR;q=0.8,tr;q=0.7,de;q=0.6',17 'content-type': 'application/x-www-form-urlencoded',18 # 'cookie': 'csv=2; edgebucket=lH2AY01VDhdJZrAOeK; loid=0000000000927nml2x.2.1606468634160.Z0FBQUFBQm1zN01jTVgwSUpobndnMVlHS2xfcGNXdk1SbXhpMjJtN0NNa2VCOFZBZ3Zlb3loSGFlZWtxWlNkdHk5cUxZVVZtNDdWQWl6M0xOdXhRc3FsWmVob0pfQXdjQjItZ1pkOHFmTWsxVVFQU194SjEwTi10MHI2ay1TU01EYjhDVjdpclUxVFg; pc=81; reddit_session=710093989689%2C2024-08-07T17%3A47%3A05%2C62b4116104fbf3597d47b0718c6986d009b6f8c6; token_v2=eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzI5MTk4MTQ0LjY5NDYyMSwiaWF0IjoxNzI5MTExNzQ0LjY5NDYyMSwianRpIjoiZXBsV3k0R1VURHl4aGwtdEhWZnI2U0lxY00xR0lnIiwiY2lkIjoiMFItV0FNaHVvby1NeVEiLCJsaWQiOiJ0Ml85MjdubWwyeCIsImFpZCI6InQyXzkyN25tbDJ4IiwibGNhIjoxNjA2NDY4NjM0MTYwLCJzY3AiOiJlSnhra2RHT3REQUloZC1sMXo3Ql95cF9OaHRzY1lhc0xRYW9rM243RFZvY2s3MDdjTDRpSFA4bktJcUZMRTJ1QktHa0tXRUZXdE9VTmlMdjU4eTlPWkVGU3lGVFI4NDN5d29rYVVwUFVtTjVweWxSd1daa0xsZmFzVUtEQjZZcFZTNloyMEtQUzV2UTNJMUZ6MDZNcWx4V0h0VFlvM0pwYkdNSzJ4UGp6Y1pxUXlxdXk2bE1ZRmtvbjhXTGZ2eUctdFktZjdiZmhIWXdyS2dLRF9UT3VGeHdZX0hERkhiX25wcjBiRjJ3cUwzWGc5US0xLU4yN2JObW9kbTVfVnpQdnphU2NUbUc1aWZZdjd0LUNSMTQ1SG1aVVFjd1lnMF95ckFqNl9Ddk9vREtCUVdNSlloUEk1QXJsMl9fSmRpdVRmOGF0eWQtLUdiRVRXXzRyUm1vNXhMRW9VX2o2emNBQVBfX1hEX2U0dyIsInJjaWQiOiJBNjE2cG1hN0taX1R1SzRVOFJlQlJUaXVKV3VBZ3lUY2VRTUpyS01NRk93IiwiZmxvIjoyfQ.FBxK7Xnbhy-bW3l71YopqqUBpjfkOdz8XBUauNi3o3pScQLvO0sOs72E2kMiaYX6iTfUPyklR5xRnGVF6PjQmurx2vu8XAm3W1IkGIYPZOOvjnWKhbzv1m8bzfOHGSIZg9bOy7RoCce6A-HCKfR6y4nQyMaiv5jCUdLILePHdUYw3kZEC_ASAXEXvv-dyyaO2GCW_Jxq95CU6lYxqLaO73xhPzR9YjNl_RaAC9xMip6d5Xe3n5wuMdY8bQ3dAfqNVNJKI4fkIij0v90-SJT7vKffNSbueqrckCPgDIXQrpJA1_bx-npHLl5gg7-uBLwDUzXpWMO_BTDxgekscFc6fQ; reddit_chat_view=closed; t2_927nml2x_recentclicks3=t3_1g57do3%2Ct3_e7ewoo%2Ct3_1fvuu0l%2Ct3_435p6x%2Ct3_d956ag%2Ct3_15svnqa%2Ct3_f2nxzt%2Ct3_e6ryal%2Ct3_79uq5s%2Ct3_7qry4j; csrf_token=d7886d7dde33b8ae9f535d8cf19dad8f; session_tracker=mifofnihaddjdlkjml.0.1729129739819.Z0FBQUFBQm5FRzBMWUZrSlZycUctVmcwZ25zZm9ZRTV4T1NMNjdQTW45dTI1eFQ1NDVqTWF2N20yQzlXNVFCUkEyNndKazVCbWJ1ZHFoVlFZMEFPS2xGYXpDY2Fxcm4xX1F6UEZfWFpfal92NTVuRDF6Q0EzTWtOT3lZOENQQUVBaFlScWQwMGpqZFk',19 'origin': 'https://www.reddit.com',20 'priority': 'u=1, i',21 'referer': 'https://www.reddit.com/r/AskReddit/comments/1g57do3/whats_a_bitter_life_lesson_you_learned_from_your/',22 'sec-ch-ua': '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',23 'sec-ch-ua-mobile': '?0',24 'sec-ch-ua-platform': '"Windows"',25 'sec-fetch-dest': 'empty',26 'sec-fetch-mode': 'cors',27 'sec-fetch-site': 'same-origin',28 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',29}
example/advanced-reddit-scraper/src/__main__.py
import asyncio

from .main import main

# Execute the Actor entrypoint.
asyncio.run(main())