From 50a697ac994939235418c2d5435bbb0297cf2443 Mon Sep 17 00:00:00 2001 From: "user.mail" Date: Tue, 3 Mar 2026 04:35:44 +0300 Subject: [PATCH 1/2] add browserapi support as well as rest of the datasets --- CHANGELOG.md | 8 + notebooks/04_web_unlocker.ipynb | 4 +- notebooks/05_scraper_studio.ipynb | 313 ++++++++ notebooks/06_browser_api.ipynb | 518 +++++++++++++ notebooks/browser_screenshot.png | Bin 0 -> 26998 bytes notebooks/datasets/amazon/amazon.ipynb | 22 +- pyproject.toml | 2 +- src/brightdata/__init__.py | 10 + src/brightdata/api/browser/__init__.py | 1 - src/brightdata/api/browser/browser_api.py | 1 - src/brightdata/api/browser/browser_pool.py | 1 - src/brightdata/api/browser/config.py | 1 - src/brightdata/api/browser/session.py | 1 - src/brightdata/api/browser_service.py | 62 ++ src/brightdata/api/scraper_studio_service.py | 147 ++++ src/brightdata/client.py | 91 ++- src/brightdata/constants.py | 10 + src/brightdata/core/zone_manager.py | 10 - src/brightdata/datasets/__init__.py | 208 +++++- src/brightdata/datasets/agoda/__init__.py | 5 + src/brightdata/datasets/agoda/properties.py | 25 + src/brightdata/datasets/amazon/__init__.py | 14 +- .../datasets/amazon/best_sellers.py | 25 + .../datasets/amazon/products_global.py | 25 + .../datasets/amazon/products_search.py | 25 + src/brightdata/datasets/amazon/walmart.py | 25 + .../datasets/apple_appstore/__init__.py | 6 + .../datasets/apple_appstore/reviews.py | 25 + .../datasets/apple_appstore/store.py | 25 + src/brightdata/datasets/autozone/__init__.py | 5 + src/brightdata/datasets/autozone/products.py | 25 + src/brightdata/datasets/bbc/__init__.py | 5 + src/brightdata/datasets/bbc/news.py | 25 + src/brightdata/datasets/bestbuy/__init__.py | 5 + src/brightdata/datasets/bestbuy/products.py | 25 + src/brightdata/datasets/bh/__init__.py | 5 + src/brightdata/datasets/bh/products.py | 25 + src/brightdata/datasets/bluesky/__init__.py | 6 + src/brightdata/datasets/bluesky/posts.py | 25 + 
.../datasets/bluesky/top_profiles.py | 25 + src/brightdata/datasets/booking/__init__.py | 6 + .../datasets/booking/hotel_listings.py | 25 + .../datasets/booking/listings_search.py | 25 + src/brightdata/datasets/carsales/__init__.py | 5 + src/brightdata/datasets/carsales/listings.py | 25 + src/brightdata/datasets/client.py | 702 +++++++++++++++++- src/brightdata/datasets/cnn/__init__.py | 5 + src/brightdata/datasets/cnn/news.py | 25 + src/brightdata/datasets/costco/__init__.py | 5 + src/brightdata/datasets/costco/products.py | 25 + .../datasets/creative_commons/__init__.py | 6 + .../datasets/creative_commons/images.py | 25 + .../datasets/creative_commons/models_3d.py | 25 + src/brightdata/datasets/ebay/__init__.py | 5 + src/brightdata/datasets/ebay/products.py | 25 + src/brightdata/datasets/etsy/__init__.py | 5 + src/brightdata/datasets/etsy/products.py | 25 + src/brightdata/datasets/facebook/__init__.py | 22 +- src/brightdata/datasets/facebook/comments.py | 25 + .../datasets/facebook/company_reviews.py | 25 + src/brightdata/datasets/facebook/events.py | 25 + .../datasets/facebook/group_posts.py | 25 + .../datasets/facebook/marketplace.py | 25 + .../datasets/facebook/pages_profiles.py | 25 + .../datasets/facebook/posts_by_url.py | 25 + src/brightdata/datasets/facebook/profiles.py | 25 + src/brightdata/datasets/facebook/reels.py | 25 + src/brightdata/datasets/github/__init__.py | 5 + .../datasets/github/repositories.py | 25 + .../datasets/google_maps/__init__.py | 6 +- .../datasets/google_maps/full_info.py | 25 + .../datasets/google_news/__init__.py | 5 + src/brightdata/datasets/google_news/news.py | 25 + .../datasets/google_play/__init__.py | 6 + .../datasets/google_play/reviews.py | 25 + src/brightdata/datasets/google_play/store.py | 25 + .../datasets/google_shopping/__init__.py | 6 + .../datasets/google_shopping/products.py | 25 + .../datasets/google_shopping/search_us.py | 25 + src/brightdata/datasets/homedepot/__init__.py | 6 + 
.../datasets/homedepot/products_ca.py | 25 + .../datasets/homedepot/products_us.py | 25 + src/brightdata/datasets/instagram/__init__.py | 9 +- src/brightdata/datasets/instagram/comments.py | 25 + src/brightdata/datasets/instagram/reels.py | 25 + src/brightdata/datasets/kroger/__init__.py | 5 + src/brightdata/datasets/kroger/products.py | 25 + src/brightdata/datasets/lazada/__init__.py | 8 +- .../datasets/lazada/products_search.py | 25 + src/brightdata/datasets/lazada/reviews.py | 25 + src/brightdata/datasets/linkedin/__init__.py | 10 +- src/brightdata/datasets/linkedin/posts.py | 25 + .../linkedin/profiles_job_listings.py | 25 + src/brightdata/datasets/lowes/__init__.py | 5 + src/brightdata/datasets/lowes/products.py | 25 + src/brightdata/datasets/macys/__init__.py | 5 + src/brightdata/datasets/macys/products.py | 25 + .../datasets/mercadolivre/__init__.py | 5 + .../datasets/mercadolivre/products.py | 25 + .../datasets/microcenter/__init__.py | 5 + .../datasets/microcenter/products.py | 25 + src/brightdata/datasets/myntra/__init__.py | 5 + src/brightdata/datasets/myntra/products.py | 25 + src/brightdata/datasets/naver/__init__.py | 5 + src/brightdata/datasets/naver/products.py | 25 + src/brightdata/datasets/ozon/__init__.py | 5 + src/brightdata/datasets/ozon/products.py | 25 + src/brightdata/datasets/quora/__init__.py | 5 + src/brightdata/datasets/quora/posts.py | 25 + src/brightdata/datasets/realtor/__init__.py | 5 + .../realtor/international_properties.py | 25 + src/brightdata/datasets/reddit/__init__.py | 6 + src/brightdata/datasets/reddit/comments.py | 25 + src/brightdata/datasets/reddit/posts.py | 25 + src/brightdata/datasets/rona/__init__.py | 5 + src/brightdata/datasets/rona/products.py | 25 + src/brightdata/datasets/snapchat/__init__.py | 5 + src/brightdata/datasets/snapchat/posts.py | 25 + src/brightdata/datasets/tiktok/__init__.py | 10 +- src/brightdata/datasets/tiktok/comments.py | 25 + src/brightdata/datasets/tiktok/posts.py | 25 + 
src/brightdata/datasets/tiktok/shop.py | 25 + src/brightdata/datasets/tokopedia/__init__.py | 5 + src/brightdata/datasets/tokopedia/products.py | 25 + src/brightdata/datasets/vimeo/__init__.py | 5 + src/brightdata/datasets/vimeo/videos.py | 25 + src/brightdata/datasets/walmart/__init__.py | 6 +- src/brightdata/datasets/walmart/sellers.py | 25 + src/brightdata/datasets/wayfair/__init__.py | 5 + src/brightdata/datasets/wayfair/products.py | 25 + src/brightdata/datasets/wikipedia/__init__.py | 5 + src/brightdata/datasets/wikipedia/articles.py | 25 + .../datasets/wildberries/__init__.py | 5 + .../datasets/wildberries/products.py | 25 + src/brightdata/datasets/x_twitter/__init__.py | 6 + src/brightdata/datasets/x_twitter/posts.py | 25 + src/brightdata/datasets/x_twitter/profiles.py | 25 + .../datasets/yahoo_finance/__init__.py | 5 + .../datasets/yahoo_finance/businesses.py | 25 + src/brightdata/datasets/zillow/__init__.py | 6 +- .../datasets/zillow/price_history.py | 25 + src/brightdata/datasets/zoopla/__init__.py | 5 + src/brightdata/datasets/zoopla/properties.py | 25 + src/brightdata/scraper_studio/__init__.py | 5 + src/brightdata/scraper_studio/client.py | 137 ++++ src/brightdata/scraper_studio/models.py | 148 ++++ src/brightdata/sync_client.py | 62 +- tests/enes/chatgpt_02.py | 6 +- tests/enes/serp.py | 6 +- tests/enes/zones/cache_fix.py | 12 +- tests/enes/zones/dash_sync.py | 8 +- tests/enes/zones/permission.py | 18 +- tests/unit/test_client.py | 3 - tests/unit/test_zone_manager.py | 1 - 154 files changed, 4588 insertions(+), 80 deletions(-) create mode 100644 notebooks/05_scraper_studio.ipynb create mode 100644 notebooks/06_browser_api.ipynb create mode 100644 notebooks/browser_screenshot.png delete mode 100644 src/brightdata/api/browser/__init__.py delete mode 100644 src/brightdata/api/browser/browser_api.py delete mode 100644 src/brightdata/api/browser/browser_pool.py delete mode 100644 src/brightdata/api/browser/config.py delete mode 100644 
src/brightdata/api/browser/session.py create mode 100644 src/brightdata/api/browser_service.py create mode 100644 src/brightdata/api/scraper_studio_service.py create mode 100644 src/brightdata/datasets/agoda/__init__.py create mode 100644 src/brightdata/datasets/agoda/properties.py create mode 100644 src/brightdata/datasets/amazon/best_sellers.py create mode 100644 src/brightdata/datasets/amazon/products_global.py create mode 100644 src/brightdata/datasets/amazon/products_search.py create mode 100644 src/brightdata/datasets/amazon/walmart.py create mode 100644 src/brightdata/datasets/apple_appstore/__init__.py create mode 100644 src/brightdata/datasets/apple_appstore/reviews.py create mode 100644 src/brightdata/datasets/apple_appstore/store.py create mode 100644 src/brightdata/datasets/autozone/__init__.py create mode 100644 src/brightdata/datasets/autozone/products.py create mode 100644 src/brightdata/datasets/bbc/__init__.py create mode 100644 src/brightdata/datasets/bbc/news.py create mode 100644 src/brightdata/datasets/bestbuy/__init__.py create mode 100644 src/brightdata/datasets/bestbuy/products.py create mode 100644 src/brightdata/datasets/bh/__init__.py create mode 100644 src/brightdata/datasets/bh/products.py create mode 100644 src/brightdata/datasets/bluesky/__init__.py create mode 100644 src/brightdata/datasets/bluesky/posts.py create mode 100644 src/brightdata/datasets/bluesky/top_profiles.py create mode 100644 src/brightdata/datasets/booking/__init__.py create mode 100644 src/brightdata/datasets/booking/hotel_listings.py create mode 100644 src/brightdata/datasets/booking/listings_search.py create mode 100644 src/brightdata/datasets/carsales/__init__.py create mode 100644 src/brightdata/datasets/carsales/listings.py create mode 100644 src/brightdata/datasets/cnn/__init__.py create mode 100644 src/brightdata/datasets/cnn/news.py create mode 100644 src/brightdata/datasets/costco/__init__.py create mode 100644 src/brightdata/datasets/costco/products.py 
create mode 100644 src/brightdata/datasets/creative_commons/__init__.py create mode 100644 src/brightdata/datasets/creative_commons/images.py create mode 100644 src/brightdata/datasets/creative_commons/models_3d.py create mode 100644 src/brightdata/datasets/ebay/__init__.py create mode 100644 src/brightdata/datasets/ebay/products.py create mode 100644 src/brightdata/datasets/etsy/__init__.py create mode 100644 src/brightdata/datasets/etsy/products.py create mode 100644 src/brightdata/datasets/facebook/comments.py create mode 100644 src/brightdata/datasets/facebook/company_reviews.py create mode 100644 src/brightdata/datasets/facebook/events.py create mode 100644 src/brightdata/datasets/facebook/group_posts.py create mode 100644 src/brightdata/datasets/facebook/marketplace.py create mode 100644 src/brightdata/datasets/facebook/pages_profiles.py create mode 100644 src/brightdata/datasets/facebook/posts_by_url.py create mode 100644 src/brightdata/datasets/facebook/profiles.py create mode 100644 src/brightdata/datasets/facebook/reels.py create mode 100644 src/brightdata/datasets/github/__init__.py create mode 100644 src/brightdata/datasets/github/repositories.py create mode 100644 src/brightdata/datasets/google_maps/full_info.py create mode 100644 src/brightdata/datasets/google_news/__init__.py create mode 100644 src/brightdata/datasets/google_news/news.py create mode 100644 src/brightdata/datasets/google_play/__init__.py create mode 100644 src/brightdata/datasets/google_play/reviews.py create mode 100644 src/brightdata/datasets/google_play/store.py create mode 100644 src/brightdata/datasets/google_shopping/__init__.py create mode 100644 src/brightdata/datasets/google_shopping/products.py create mode 100644 src/brightdata/datasets/google_shopping/search_us.py create mode 100644 src/brightdata/datasets/homedepot/__init__.py create mode 100644 src/brightdata/datasets/homedepot/products_ca.py create mode 100644 src/brightdata/datasets/homedepot/products_us.py create mode 
100644 src/brightdata/datasets/instagram/comments.py create mode 100644 src/brightdata/datasets/instagram/reels.py create mode 100644 src/brightdata/datasets/kroger/__init__.py create mode 100644 src/brightdata/datasets/kroger/products.py create mode 100644 src/brightdata/datasets/lazada/products_search.py create mode 100644 src/brightdata/datasets/lazada/reviews.py create mode 100644 src/brightdata/datasets/linkedin/posts.py create mode 100644 src/brightdata/datasets/linkedin/profiles_job_listings.py create mode 100644 src/brightdata/datasets/lowes/__init__.py create mode 100644 src/brightdata/datasets/lowes/products.py create mode 100644 src/brightdata/datasets/macys/__init__.py create mode 100644 src/brightdata/datasets/macys/products.py create mode 100644 src/brightdata/datasets/mercadolivre/__init__.py create mode 100644 src/brightdata/datasets/mercadolivre/products.py create mode 100644 src/brightdata/datasets/microcenter/__init__.py create mode 100644 src/brightdata/datasets/microcenter/products.py create mode 100644 src/brightdata/datasets/myntra/__init__.py create mode 100644 src/brightdata/datasets/myntra/products.py create mode 100644 src/brightdata/datasets/naver/__init__.py create mode 100644 src/brightdata/datasets/naver/products.py create mode 100644 src/brightdata/datasets/ozon/__init__.py create mode 100644 src/brightdata/datasets/ozon/products.py create mode 100644 src/brightdata/datasets/quora/__init__.py create mode 100644 src/brightdata/datasets/quora/posts.py create mode 100644 src/brightdata/datasets/realtor/__init__.py create mode 100644 src/brightdata/datasets/realtor/international_properties.py create mode 100644 src/brightdata/datasets/reddit/__init__.py create mode 100644 src/brightdata/datasets/reddit/comments.py create mode 100644 src/brightdata/datasets/reddit/posts.py create mode 100644 src/brightdata/datasets/rona/__init__.py create mode 100644 src/brightdata/datasets/rona/products.py create mode 100644 
src/brightdata/datasets/snapchat/__init__.py create mode 100644 src/brightdata/datasets/snapchat/posts.py create mode 100644 src/brightdata/datasets/tiktok/comments.py create mode 100644 src/brightdata/datasets/tiktok/posts.py create mode 100644 src/brightdata/datasets/tiktok/shop.py create mode 100644 src/brightdata/datasets/tokopedia/__init__.py create mode 100644 src/brightdata/datasets/tokopedia/products.py create mode 100644 src/brightdata/datasets/vimeo/__init__.py create mode 100644 src/brightdata/datasets/vimeo/videos.py create mode 100644 src/brightdata/datasets/walmart/sellers.py create mode 100644 src/brightdata/datasets/wayfair/__init__.py create mode 100644 src/brightdata/datasets/wayfair/products.py create mode 100644 src/brightdata/datasets/wikipedia/__init__.py create mode 100644 src/brightdata/datasets/wikipedia/articles.py create mode 100644 src/brightdata/datasets/wildberries/__init__.py create mode 100644 src/brightdata/datasets/wildberries/products.py create mode 100644 src/brightdata/datasets/x_twitter/__init__.py create mode 100644 src/brightdata/datasets/x_twitter/posts.py create mode 100644 src/brightdata/datasets/x_twitter/profiles.py create mode 100644 src/brightdata/datasets/yahoo_finance/__init__.py create mode 100644 src/brightdata/datasets/yahoo_finance/businesses.py create mode 100644 src/brightdata/datasets/zillow/price_history.py create mode 100644 src/brightdata/datasets/zoopla/__init__.py create mode 100644 src/brightdata/datasets/zoopla/properties.py create mode 100644 src/brightdata/scraper_studio/__init__.py create mode 100644 src/brightdata/scraper_studio/client.py create mode 100644 src/brightdata/scraper_studio/models.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bf1e79..e7b4f34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Bright Data Python SDK Changelog +## Version 2.2.2 - Browser API, Scraper Studio, 175 Datasets + +- **Browser API**: Connect to cloud Chrome via CDP WebSocket. 
SDK builds the `wss://` URL, you connect with Playwright/Puppeteer (`client.browser.get_connect_url()`) +- **Scraper Studio**: Trigger and fetch results from custom scrapers built in Bright Data's IDE (`client.scraper_studio.run()`) +- **75 more datasets**: Agoda, AutoZone, BBC, Best Buy, Bluesky, Booking, Costco, eBay, Etsy, GitHub, Google News/Play/Shopping, Home Depot, Kroger, Lowe's, Macy's, Microcenter, Ozon, Quora, Realtor, Reddit, Snapchat, TikTok Shop, Tokopedia, Vimeo, Wayfair, Wikipedia, Wildberries, X/Twitter, Yahoo Finance, Zoopla, and more — **175 total** + +--- + ## Version 2.2.1 - 100 Datasets API ### ✨ New Features diff --git a/notebooks/04_web_unlocker.ipynb b/notebooks/04_web_unlocker.ipynb index 8bc5b7a..8160908 100644 --- a/notebooks/04_web_unlocker.ipynb +++ b/notebooks/04_web_unlocker.ipynb @@ -326,13 +326,13 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "name": "python", - "version": "3.11.0" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/notebooks/05_scraper_studio.ipynb b/notebooks/05_scraper_studio.ipynb new file mode 100644 index 0000000..658e3d7 --- /dev/null +++ b/notebooks/05_scraper_studio.ipynb @@ -0,0 +1,313 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scraper Studio - Custom Scrapers via SDK\n", + "\n", + "Trigger and fetch results from your custom scrapers built via Bright Data's Scraper Studio (AI Agent, IDE, or templates).\n", + "\n", + "## What You'll Learn\n", + "1. Setup and authentication\n", + "2. Trigger a custom scraper\n", + "3. Fetch results when ready\n", + "4. Check job status\n", + "5. 
Multiple inputs\n", + "\n", + "---\n", + "\n", + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API Token: 7011787d-2...3336\n", + "Setup complete!\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "API_TOKEN = os.getenv(\"BRIGHTDATA_API_TOKEN\")\n", + "if not API_TOKEN:\n", + " raise ValueError(\"Set BRIGHTDATA_API_TOKEN in .env file\")\n", + "\n", + "print(f\"API Token: {API_TOKEN[:10]}...{API_TOKEN[-4:]}\")\n", + "print(\"Setup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Client initialized\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "\n", + "client = BrightDataClient(token=API_TOKEN)\n", + "await client.__aenter__()\n", + "\n", + "# Your collector ID from Scraper Studio dashboard\n", + "COLLECTOR_ID = \"c_mly0sa6x10hshxi8jb\" # Replace with your collector ID\n", + "\n", + "print(\"Client initialized\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Single URL - Trigger\n", + "\n", + "Trigger the scraper. Returns immediately with a job object containing the `response_id`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job triggered: d2t1771835182154rujjlatrcl4o\n" + ] + } + ], + "source": [ + "# Trigger - returns immediately\n", + "job = await client.scraper_studio.trigger(\n", + " collector=COLLECTOR_ID,\n", + " input={\"url\": \"https://www.sahibinden.com/ilan/emlak-konut-satilik-golden-gate-1287846580/detay\"},\n", + ")\n", + "print(f\"Job triggered: {job.response_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Single URL - Fetch\n", + "\n", + "Try to fetch the result. If not ready yet, re-run this cell." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Got 1 record(s)\n", + " title: GOLDEN GATE\n", + " price: {'value': 6600000, 'currency': 'TRY', 'symbol': '₺'}\n", + " property_size: 100\n", + " room_count: 3\n", + " building_age: 6-10 arası\n" + ] + } + ], + "source": [ + "# Fetch - single attempt, re-run if not ready\n", + "try:\n", + " data = await job.fetch()\n", + " print(f\"Got {len(data)} record(s)\")\n", + " for record in data:\n", + " for key, value in list(record.items())[:5]:\n", + " print(f\" {key}: {value}\")\n", + "except Exception as e:\n", + " print(f\"Not ready yet: {e}\\nRe-run this cell in a few seconds.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Check Job Status\n", + "\n", + "Check the status of a previously triggered job using its job ID (from the Scraper Studio dashboard)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job ID: j_mly4pzxd1mj4u0gjj8\n", + "Status: done\n", + "Collector: c_mly0sa6x10hshxi8jb\n", + "Inputs: 1\n", + "Lines: 1\n", + "Success rate: 1\n", + "Job time: 106996ms\n" + ] + } + ], + "source": [ + "# Check status of a known job\n", + "JOB_ID = \"j_mly4pzxd1mj4u0gjj8\" # Replace with your job ID\n", + "\n", + "info = await client.scraper_studio.status(job_id=JOB_ID)\n", + "\n", + "print(f\"Job ID: {info.id}\")\n", + "print(f\"Status: {info.status}\")\n", + "print(f\"Collector: {info.collector}\")\n", + "print(f\"Inputs: {info.inputs}\")\n", + "print(f\"Lines: {info.lines}\")\n", + "print(f\"Success rate: {info.success_rate}\")\n", + "print(f\"Job time: {info.job_time}ms\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Multiple Inputs\n", + "\n", + "`run()` accepts a list of inputs, triggers each, polls, and returns combined results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "urls = [\n", + " {\"url\": \"https://www.sahibinden.com/ilan/emlak-konut-satilik-golden-gate-1287846580/detay\"},\n", + " {\"url\": \"https://www.sahibinden.com/ilan/emlak-konut-satilik-golden-gate-1287846581/detay\"},\n", + "]\n", + "\n", + "multi_data = await client.scraper_studio.run(\n", + " collector=COLLECTOR_ID,\n", + " input=urls,\n", + " timeout=300,\n", + ")\n", + "\n", + "print(f\"Got {len(multi_data)} total record(s)\")\n", + "for record in multi_data:\n", + " print(f\" - {record.get('title', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Save Results\n", + "\n", + "Save the scraped data to a JSON file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open(\"scraper_studio_results.json\", \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(data, f, indent=2, ensure_ascii=False)\n", + "\n", + "print(f\"Saved {len(data)} record(s) to scraper_studio_results.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await client.__aexit__(None, None, None)\n", + "print(\"Client closed.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Summary\n", + "\n", + "| Method | What it does |\n", + "|--------|-------------|\n", + "| `client.scraper_studio.run(collector, input)` | Trigger + poll + return data |\n", + "| `client.scraper_studio.trigger(collector, input)` | Trigger only, returns job object |\n", + "| `job.fetch()` | Single fetch attempt |\n", + "| `job.wait_and_fetch(timeout)` | Poll until data arrives |\n", + "| `client.scraper_studio.status(job_id)` | Check job status |\n", + "| `client.scraper_studio.fetch(response_id)` | Fetch results by response_id |\n", + "\n", + "## Resources\n", + "\n", + "- [Scraper Studio Dashboard](https://brightdata.com/cp/data_collector)\n", + "- [API Reference](https://docs.brightdata.com/api-reference/scraper-studio-api/)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/06_browser_api.ipynb b/notebooks/06_browser_api.ipynb new file mode 100644 index 0000000..24bf709 --- /dev/null +++ b/notebooks/06_browser_api.ipynb @@ -0,0 +1,518 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": 
[ + "# 🌐 Browser API - Cloud Chrome via CDP\n", + "\n", + "Connect to Bright Data's cloud-hosted Chrome browsers via the Chrome DevTools Protocol (CDP).\n", + "\n", + "The SDK builds the WebSocket connection URL; you connect with Playwright, Puppeteer, or Selenium.\n", + "\n", + "**What you need:**\n", + "- `BRIGHTDATA_API_TOKEN` (for other SDK services)\n", + "- `BRIGHTDATA_BROWSERAPI_USERNAME` and `BRIGHTDATA_BROWSERAPI_PASSWORD` (from your Browser API zone)\n", + "- `playwright` installed (`pip install playwright && playwright install chromium`)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API Token: 7011787d-2...3336\n", + "Browser Username: brd-customer-hl_1cdf...\n", + "Browser Password: ************\n", + "\n", + "Setup complete!\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "API_TOKEN = os.getenv(\"BRIGHTDATA_API_TOKEN\")\n", + "BROWSER_USER = os.getenv(\"BRIGHTDATA_BROWSERAPI_USERNAME\")\n", + "BROWSER_PASS = os.getenv(\"BRIGHTDATA_BROWSERAPI_PASSWORD\")\n", + "\n", + "if not API_TOKEN:\n", + " raise ValueError(\"Set BRIGHTDATA_API_TOKEN in .env file\")\n", + "if not BROWSER_USER or not BROWSER_PASS:\n", + " raise ValueError(\n", + " \"Set BRIGHTDATA_BROWSERAPI_USERNAME and BRIGHTDATA_BROWSERAPI_PASSWORD in .env file.\\n\"\n", + " \"Find credentials at: https://brightdata.com/cp/zones (Browser API zone > Overview tab)\"\n", + " )\n", + "\n", + "print(f\"API Token: {API_TOKEN[:10]}...{API_TOKEN[-4:]}\")\n", + "print(f\"Browser Username: {BROWSER_USER[:20]}...\")\n", + "print(f\"Browser Password: {'*' * len(BROWSER_PASS)}\")\n", + "print(\"\\nSetup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Client" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Client initialized\n", + "Browser API ready: True\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "\n", + "# Credentials load automatically from env vars\n", + "client = BrightDataClient()\n", + "\n", + "print(\"Client initialized\")\n", + "print(f\"Browser API ready: {client.browser is not None}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 1: Get Connection URL\n", + "\n", + "Build the CDP WebSocket URL. This is what you pass to Playwright's `connect_over_cdp()`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection URL: wss://brd-customer-hl_1cdf8003-zone-scraping_browser1:****@brd.superproxy.io:9222\n", + "\n", + "Protocol: wss://\n", + "Host: brd.superproxy.io\n", + "Port: 9222\n" + ] + } + ], + "source": [ + "# Basic URL (no geo-targeting)\n", + "url = client.browser.get_connect_url()\n", + "\n", + "# Mask password in output for notebook demo only\n", + "display_url = url.replace(BROWSER_PASS, \"****\")\n", + "print(f\"Connection URL: {display_url}\")\n", + "print(f\"\\nProtocol: wss://\")\n", + "print(f\"Host: brd.superproxy.io\")\n", + "print(f\"Port: 9222\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 2: Geo-Targeted URL\n", + "\n", + "Append a country code to route the browser through a specific country." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Geo-targeted connection URLs:\n", + "\n", + " US: wss://brd-customer-hl_1cdf8003-zone-scraping_browser1-country-us:****@brd.superproxy.io:9222\n", + " GB: wss://brd-customer-hl_1cdf8003-zone-scraping_browser1-country-gb:****@brd.superproxy.io:9222\n", + " DE: wss://brd-customer-hl_1cdf8003-zone-scraping_browser1-country-de:****@brd.superproxy.io:9222\n", + " JP: wss://brd-customer-hl_1cdf8003-zone-scraping_browser1-country-jp:****@brd.superproxy.io:9222\n" + ] + } + ], + "source": [ + "countries = [\"us\", \"gb\", \"de\", \"jp\"]\n", + "\n", + "print(\"Geo-targeted connection URLs:\\n\")\n", + "for country in countries:\n", + " url = client.browser.get_connect_url(country=country)\n", + " display_url = url.replace(BROWSER_PASS, \"****\")\n", + " print(f\" {country.upper()}: {display_url}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 3: Connect with Playwright\n", + "\n", + "Use the connection URL to open a real cloud browser, navigate to a page, and extract the HTML." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connecting to cloud browser...\n", + "Page title: Example Domain\n", + "HTML length: 528 chars\n", + "\n", + "First 300 chars:\n", + "Example Domain" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pathlib import Path\n", + "from IPython.display import Image, display\n", + "\n", + "url = client.browser.get_connect_url()\n", + "screenshot_path = Path.cwd() / \"browser_screenshot.png\"\n", + "\n", + "print(\"Taking screenshot of example.com...\")\n", + "async with async_playwright() as pw:\n", + " browser = await pw.chromium.connect_over_cdp(url)\n", + " context = browser.contexts[0] if browser.contexts else await browser.new_context()\n", + " page = context.pages[0] if context.pages else await context.new_page()\n", + "\n", + " await page.goto(\"https://example.com\", wait_until=\"domcontentloaded\")\n", + " await page.screenshot(path=str(screenshot_path))\n", + "\n", + " await browser.close()\n", + "\n", + "print(f\"Screenshot saved: {screenshot_path}\")\n", + "display(Image(filename=str(screenshot_path)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 5: Geo-Targeted Scrape\n", + "\n", + "Connect through a specific country and scrape content that varies by location." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connecting through US proxy...\n", + "Target: https://www.whatismyip.com/\n", + "\n", + "Page title: What Is My IP Address? 
See Your Public IPv4 & IPv6\n", + "\n", + "Body text (first 500 chars):\n", + "WhatIsMyIP.com🔍NewsPricingAPISign UpLoginHelp☰What Is My IP?IP Address LookupIP WHOIS LookupSpeed TestIP ToolsLookupsIP Address LookupIP WHOIS LookupASN LookupHostname LookupDNS & NetworkSpeed TestDNS LookupReverse DNS LookupPort ScannerSubnet CalculatorCIDR CalculatorUtilitiesRandom IP GeneratorServer Headers CheckAPIPrivacy & SecurityP\n" + ] + } + ], + "source": [ + "TARGET_URL = \"https://www.whatismyip.com/\"\n", + "\n", + "url = client.browser.get_connect_url(country=\"us\")\n", + "\n", + "print(f\"Connecting through US proxy...\")\n", + "print(f\"Target: {TARGET_URL}\\n\")\n", + "\n", + "async with async_playwright() as pw:\n", + " browser = await pw.chromium.connect_over_cdp(url)\n", + " context = browser.contexts[0] if browser.contexts else await browser.new_context()\n", + " page = context.pages[0] if context.pages else await context.new_page()\n", + "\n", + " await page.goto(TARGET_URL, wait_until=\"domcontentloaded\")\n", + " await page.wait_for_timeout(3000) # Wait for JS to render\n", + "\n", + " title = await page.title()\n", + " content = await page.text_content(\"body\")\n", + "\n", + " await browser.close()\n", + "\n", + "print(f\"Page title: {title}\")\n", + "print(f\"\\nBody text (first 500 chars):\\n{content[:500] if content else 'N/A'}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 6: Multiple Pages Sequentially\n", + "\n", + "Open multiple pages in one session.\n", + "\n", + "**Note:** Bright Data's Browser API has a per-connection navigation limit.\n", + "For multiple URLs, use a fresh connection for each." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scraping: https://example.com\n", + " Title: Example Domain (528 chars)\n", + "Scraping: https://httpbin.org/html\n", + " Title: (3735 chars)\n", + "Scraping: https://jsonplaceholder.typicode.com\n", + " Title: Suspected phishing site | Cloudflare (4800 chars)\n", + "\n", + "Scraped 3 pages successfully.\n" + ] + } + ], + "source": [ + "urls = [\n", + " \"https://example.com\",\n", + " \"https://httpbin.org/html\",\n", + " \"https://jsonplaceholder.typicode.com\",\n", + "]\n", + "\n", + "results = []\n", + "\n", + "for target in urls:\n", + " connect_url = client.browser.get_connect_url()\n", + " print(f\"Scraping: {target}\")\n", + "\n", + " async with async_playwright() as pw:\n", + " browser = await pw.chromium.connect_over_cdp(connect_url)\n", + " context = browser.contexts[0] if browser.contexts else await browser.new_context()\n", + " page = context.pages[0] if context.pages else await context.new_page()\n", + "\n", + " await page.goto(target, wait_until=\"domcontentloaded\")\n", + " title = await page.title()\n", + " html = await page.content()\n", + "\n", + " await browser.close()\n", + "\n", + " results.append({\"url\": target, \"title\": title, \"html_length\": len(html)})\n", + " print(f\" Title: {title} ({len(html)} chars)\")\n", + "\n", + "print(f\"\\nScraped {len(results)} pages successfully.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 7: Export Results to JSON" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Exported to: /Users/ns/Desktop/projects/sdk-python/notebooks/browser_results.json\n", + "Pages scraped: 3\n" + ] + } + ], + "source": [ + "import json\n", + "from pathlib import Path\n", + "\n", + "if results:\n", + " 
output_file = Path.cwd() / \"browser_results.json\"\n", + "\n", + " with open(output_file, \"w\") as f:\n", + " json.dump(results, f, indent=2)\n", + "\n", + " print(f\"Exported to: {output_file}\")\n", + " print(f\"Pages scraped: {len(results)}\")\n", + "else:\n", + " print(\"No results to export\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Summary\n", + "\n", + "### SDK Methods\n", + "\n", + "| Method | Description |\n", + "|--------|-------------|\n", + "| `client.browser.get_connect_url()` | Get CDP WebSocket URL |\n", + "| `client.browser.get_connect_url(country=\"us\")` | Get geo-targeted URL |\n", + "\n", + "### Parameters\n", + "\n", + "| Parameter | Description | Default |\n", + "|-----------|-------------|--------|\n", + "| `browser_username` | Browser API username | `BRIGHTDATA_BROWSERAPI_USERNAME` env var |\n", + "| `browser_password` | Browser API password | `BRIGHTDATA_BROWSERAPI_PASSWORD` env var |\n", + "| `browser_host` | Proxy host | `brd.superproxy.io` |\n", + "| `browser_port` | Proxy port | `9222` |\n", + "| `country` | 2-letter country code for geo-targeting | `None` |\n", + "\n", + "### Connection URL Format\n", + "\n", + "```\n", + "wss://{username}:{password}@brd.superproxy.io:9222\n", + "```\n", + "\n", + "With geo-targeting:\n", + "```\n", + "wss://{username}-country-{code}:{password}@brd.superproxy.io:9222\n", + "```\n", + "\n", + "### Connecting with Playwright\n", + "\n", + "```python\n", + "from playwright.async_api import async_playwright\n", + "\n", + "url = client.browser.get_connect_url(country=\"us\")\n", + "\n", + "async with async_playwright() as pw:\n", + " browser = await pw.chromium.connect_over_cdp(url)\n", + " context = browser.contexts[0] if browser.contexts else await browser.new_context()\n", + " page = context.pages[0] if context.pages else await context.new_page()\n", + " await page.goto(\"https://example.com\")\n", + " html = await page.content()\n", + " await 
browser.close()\n", + "```\n", + "\n", + "### Tips\n", + "\n", + "- Each CDP connection has a limited navigation count — use one connection per URL for reliability\n", + "- Use `wait_until=\"domcontentloaded\"` for faster page loads when you don't need all resources\n", + "- Add `page.wait_for_timeout(ms)` if the page relies on JavaScript rendering\n", + "- Playwright must be installed separately: `pip install playwright && playwright install chromium`" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv (3.11.10)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/browser_screenshot.png b/notebooks/browser_screenshot.png new file mode 100644 index 0000000000000000000000000000000000000000..ce384c4b3263029fb3a10d8bf6ce818441e827ad GIT binary patch literal 26998 zcmeFZXH*kd*EY-yqj;;pj0KcXoM9BDDoAfSfG8a)0YX6OJxG86AwhHi0RfRFEhtr5 zA|*r!2_gd0rG@|ry@x<(frP*t?`J*#-yh$fZ>{&&St~0!Ic@KKooiqF#9Npd@}Cts z%frLN|LCFqV;-K<7kGGhFP!<6`w0)Di^{|EJI^Ei`_^H3G)|O7{&KX`0qRkep6L5w z(I(z)yN|=~ch8s#J}!GY^fFlL6Cn|y>P45`kTbSX>djZ2kT#CeuK>^MzVI>ryf3nm zzD1CqoMAzb(;|6z8o7})@QpvfOS|-PV-?{3vI$lHEL$785Y^|2mFDqwS*fGhPef97FjmP&evHbp3>Ad;OF+`;39eOuTeTlRictve<1ab zw?B7JwAE9;fBlw0MJE#4lmJ=zqMw6HYSv=33u<|;W5$PvxlXl6O6AuEx2K^l5kX5! 
zyWe&rt3C$9$Y`*;yQaj#pNv&9ps2{zRW%D(2iqj(#{h)iT@e#&QPhCuNL1UikVeWv{c}mP8mPhlbu( z)#xr}v8?u(Yk}$2tqjKMD*3Q|vC!6PLB)jjb&e+jGuJ6`hh~ zYVN6O<797N6QdFC=D$ib7!b#h+neVoi;Ihw@q0GE@;u+T0BF7FJIhIwa^4Ud2w$@f zA8zcQipv@hARwDVo*-_gIwNW;Ia@bs?rzFbRb?@2v7teDU#{9^pJOa0MghRb(?=TeG|1Lz z(BYP@8{IdiSCK`PQNe9zFh|`TR?}YLKloBh%BT`68;y70~l~XDu zO=eNcDY#^F)EZzQJltcY`@qkX0d5@pn#5$WS@MEZ@DDpJOZ&-@;NlO+0gCtV*B_0U zW5*#GsE@w*uqNV@H3AB)%6^-+4$O3>hE;IFI70$uQt04@0>G7lYT8G1O4K}Eh4J3s zd%s&X1uI1&6jd>%4y`Ip!*)D=j1FRJ+~O-r=cPbveXqHJ6Xa>+7v^$VTrH@%MvIAM zz~C6O;>{yX#JrJtfM=%uM$NnoRG`vGGdJY?L6r!??1kGDGl+}z81eX+Am`@>CCXbETJ|uWl=lZo?Vl=I%O(v;;0Tg z`d%|k;cb18cgYeQpgBsslN~QN zx<>}q@!f^RqjrR=8I2l3f%q_&UGhGG;_u+cP4I5@H(`R2b=$~_8^KlFf^!#V++`E1k7 z@2{E6X1F9`)6|i+BG%`}VrI8O!CAPgqJ07(goucp$8x!%D`C~kcXXz!!lK$*YxyyK z<^yunLtw;QP-pqRg;T>|^&oNIY9nhCoL1A$zzq<`aOAZ@*TGR}d1VnaETHak?{#>D zmEi9Ke&yWh?EYw1WvM_;r@e%L8yN@x=O!)_qgT-n@os-~?xFZHA&X_LKM8H3~;z39U%@h0v{RWmXI9vohG zKb4pU%aB#lxUIH7h3X9c$ld%$k}5Bx950EARRX-7NyE#wYmXrn`=b@Tqtxgs6=u~mfHBn2B^@F#1A#4^ZM+W2@2A~&&#dJdq5k1YY`~s4bcpJALcy+ z1Wx)!n`vDS!35Xq6o7laVj&@U#5ZX%Zl#8h{`UPTrJYCCEs>>rF$m+Q1`$3V+1Q4u zWGSfLrhxf~IQ22Cg8YGsYTY~aa0I1AYS^ag4UkBAnxjoZ((WCY-ITijHRaW2Df zVnDQ#yzsm~30z<%;h-xqD(Pp;w4{cWat=PwJPKDZJGSQZwAYz6FAiwR#L|F42McEH`VLiV5SL5VK#vN;=CMRTY&E zE#19+>uHHKaPCZe{BV1XZ~|8=d*G$+!!cB(WpTOzmTXRxpS9nhMl01zcu z=eE!@KyBK>P4XDH7Vr_JCUXK`z?YeowX<6rQx4b^1UDX4CoVf}M#^MdY$+BN-VBjJ zeXPJZw6Nw9*0ztP47vqW=*}e0bxc)l5U~H(?A;@RN~-hd$mgdkmgx^gt9tv+FO_yK zHN2(ECZ{X}qq?S@ak@BO6DJqoqt>P<^OO`*-O@);MV!i#JL83zvAS6Vi}k~f?kwFx zwX3TESUz(P=*ZYOe>WJqqyBHV@0#?Ojk}ZgD0OJ+`gSEi04?#**3sT76@Cw*K~~=rR{u-6X<``;y-#yuh!+GhgKF2Z$Tf!o`p`0HU%hh zZ7}{ImZ`7cccm+n9C@*X-E#p3fdLJ?Wj#NV=cYJP$;s_PrJX@b7ZNBU;guT)lj@UZ z17-{73qg7I&4vu|eSvaP8JrwN2*WCc~XoDd!L-E?48P)ylu8Nws5dQn6CT$ z0p2HBr=}jHgzII1;aOI>y>qcaOH!fN=RRaCET6tVsHwU+lqNioIKB6n-U6G_4wU~) zR!%|7Q93&^g_NVixJWzfHrF7_3~oR*g?g2jI$}G1g?7m{N3HHwMzfE?3ae@>8tCZz z9unr+?fkrNLoY4(UpIK@DgHO_>6c!!|H}nLh`JUGT?DdJbfo-&i}~^fnn5R*ri*j? 
zI||!(Lci%Knm+(>k(T+Ngru3_r?UdJ`Q*8Q*5j0IO5PA=7kg|{(W z4Sgy@F*CS99yfO?e(KH2{*g=IbJAQ#!g#8)#nvC%CbWy=nOp;O=WfegyR%6_ z0aH6WN;vv!U@om?9E4+ji^h7t86GZ|A(4_i6M=-0i0v(w`WuLx&bMK-_{-kizU=n# z*vZNXU5c z&A7*S+cgcAZ0(_;ti$|D&j*av2SVLsVPP)M>{p8f8(-*z`~QcB$IktYh}I!pAgRJh z1JSJF`Rgj<7)@nvYPrZNsj3r*iN=nxfJOBMgXw>0aA)1gxVN$1a-N2M#>QkeQ2MBMwfddv@f(Yalj16m^u)imq3A5+w$ z3IO5UVd&i-z6fl|@6~p6x4~Wpy&4dYF@M6{GalO&|KGSsU>+DDpB$t4`Vy_eKN7k3 zboU%70;*SJXYUF{m>L-9IoC4#zrXJdo6aR*F%lqnEW)faCp#5p=;zS#3J6uA6tPnW5C`7Z1BO74vE9PQ7OC`CYvRfA%aaj31b6MML5z<{QOAw!3`* z74k{w+|{G8K+q;*%SPzl?tPY{FPQ$knBKWQ3>pEy+rF4~- znj@kS8b&$5Fj~)*ot~D)On_{2ZoZP9TRJ#cS$Hg(<_zEZ;L%1@*7%_sF~P`Kkt?1yN-#(ZiIG|!tiav5QHF>8emHHrt zFFDVz{A}-7e)c){2k~oJlV)R4|JL0_yq)B-hmjN9_3gr55?Sm|O8cle5$1Vi0@bv% z+zq7$$&3zbzsVPog1`_^t3wZS0U6)OMQemT&9$ToUDMz;+>$v$3|b27+mB6M+&~hs zMb$Nm;%uyuwK6e?xSZswoclmKhMnAyuW65q;eBk&>|Xa*ps~+%mbL4||JSIv_kO9- zuWW5Fq49%dbH(bz4VI-tOopl9dQjObqH<&n+ek%CTIa{qAhNj0t!)F@T->lY)i}?W zi0pgX4Gy)woA>%36n-u~?2$_7YQ)ANMes-K4I?eHe0=b6h9q`zR{T!<_C!b4KrpI` zvNnU0d$lZO)Gbr#1ITf3QimQ7m8hC&w`?M#fed0>nZhyCGuhf-aTbQO51H>uHWoJ5 zZCNhrdz~3MG{If1PjisoK16Vl>~r(?yEN=V#>jYNl^4+4z`$k^HJ7q~Yxn7Dd{1?! 
zhSmDxn(fRTupx8NU@`v+K%yY;ol~!jG{vT|B?vZKbZe_Sq0Do6INYGUqy!(Dk+E@P zXZ;{~_bY98ccC}f4SC1b!rH;X*4C+pJ^yS(C-diOji-;FIrw`*>nAlj^9SQ^<)38e z{|yQDW%^xiZsISIL&ctu*QW9wW5;3&H>=0%loU?$@O&xy>wmb`FW3IBJN*A2ANzcu zp`oGUxzv4D$KpvT=VTAliDV1<94{?zHqOwnCyUd^?I}7|O?Wkh{qN@()!grVZOJ~` zsK89&e{T^#cTo>`oYmrVl;jyIr1Rp((O|WlHiO!7fDwSQ3HlIBmv6NWA1dQlCG6x@e`k&Ayi_AAUFBK5y2RL#PD=|z3 zEbIsdTr#SNidvRD2@LvdVjb4rV&@C2;M!N{|TwoXv6=HMt<6}c{+NW1B+hTPsl zeeIoF2=fV6(aC+(>B2nNb$+;4k7EvelUW`p&}>@%Bg(Q{mMUtRB>y1h72>E^in`&4 zf7AWC48l8hZ`$P^heO#LhlR2zbn~ z1WW0bm%asK@=f~}fzxy!-DdXFmR79^thV$y5^~G=ZLm8tXu4v0b?2lBUeyR zu4QNWJKQV9$NyXrFT&EpdTjg|-Ma50B4x{Ky@;j6r72Ja!y zzUY9mb;&iU>k9;K42!~6>g>Pzx!fXF<&&_&73>l|BUe0}Mc zO6K&Jq3lmF4ZFWrs}h2SwB91XGJW4{6>bPRZO71}qdxxwahSHs6X~zR0SaSXQ+Y zTvcH}4`tgZlOXMpmQVfKNzOcn>#7xG9b(NQg1oMU$T$uDumF)hsfq*41+*dqE*&3` zhcvSs2s3AIKjcTEGYEz=hdmhi!wX8 zrDfFhb^0nmv;YKtgba7}?JS3zcQyvj=cDL2su;=<^-0s|@o3=g%JIxylhyJMJ=$4R zVvd8oija9aP*v1l{c%e`VFsDmFRH11rP5cER6jl({wy&3udT30{?PTVQr`A#-|00q zwntEqJMW>5p&3q@R0T-74c`bdH=q8pwUGbUf0burW=nmlTM zNeE3Crf|mYt(k#uR5&9E8!_6xAT#r35(!$?_sv+kQV>aKJUrBnnfzGecUg2@ywXpd z*-f=(0RYcf3+kEYH`0j1r!6mIThnoN{& z!^tBsyg_u-pW?X=Lt_b3OPMHw%J0gr7q%AGh>KY^x=a^iv*ECJI|1(emaTaBnAk!0 z1WPzKS5VI8mY}j1ny?rI$qf&V9ZN*E&T#$&s6uI?(uo6DV2-mEHsjn9l zz8e}`P!Vq)S-CeJGqP=3J|T4_`TqrHDNEdH*`6RTB*!MBfvN@fAYg2| z01QSUn-p>x=qv{TiSGpQX{O{8aE;8{d@%xf%z_-mm>L`!Gz@Z}gqL=H)gPInTO&g; zGAZ>fNWaB%{REx@LIq)!zSd0T&f8zUq;IKua$FJKB?l*LZA@SYt7oSfhvqJ^E591p%|D$Qa7Gle$n znsvru;kU_@hR^eno#&V+z%@tjGQ81%0>vr7Cc@=?ksQ8g5o;Q^URjtR3OF_k3P9HV zr*FPw(I#UxdzC@s4@KG^(qA;&SsGYIvNMDdL*FB|BJSIO^&$i4AEX-c5(HYwt=VB* z@J?DU6v=KuHN9w;Zfgl=pchMWz~N-x(JY##rb-s6v057R)?Regza?0E zb7l~jQxsA{C5))%t!WQCjf5BQGVMg3axvpRP%{lWXlmlxW6XwK{w4%TKJRq30Qn|P z&)vf?VuHP71RxGJsH)^f;ys_#pU`}>?VxiT(9Kd`?lr?sBv^G920w){?2r}H=#=Z6ETi8D8hNZ2O zh;rT0_FRcaCQ5W=6_7ltf*ncUl$N$@&W^-Gu8mGo#bxY?rV7%N&g#0_nkwQHt#8KN zGPSDz9Lo6Ly{64DGhBZ>T<#yy*ic)4O(hDtVB{P%O&<9k2JeiY0*&$MY}yJ&?*FcA zz1f{>Mwle2j>97&pM^Ue&gu1a0v?zN>T8ldMJ-bAXlTOalRS*pl5r#Q(}~lNX;Q|H 
z57Rddx35CjcJZbyjS^9<*3-WJHDg8D#^AQ_J%yIXU{SpK?taP23~|gpf+)%SB_q3i zkV=y0AFx#+PFdTC_hrDx@I;a#`MsMomQ@wN9wK zQMi$#F^AxAN@~$y+5B8zWtIZ}J}lX+b4(kyhU5f@omN*zW+pjeVkM^r<4n42RW8~X53J74+Jy{vEfEHl zl_zFVtxr@{%c6lQ@B8qZ)OV(Ua^OSx6)%IZilCJ#JNV5`GPr@HCULMM^S4pb+ak;JAZM!_;hsq_7^fShrmpoSN2C62 z_ma&B(jPM5C9SNc0rx@#q@zR{74HhmczfA({p&H^5*C5A6sNSvu*~ivw43(NhvG^f zm=md?2jcD^|2^Z&7PyGfwQ&Ec;5g5}zm{cz~5kbNO zYjuvWGAxwh@04(0z7}qbRD66WJhU22U7D5_qHb<h>)fV7L2^IE)=Rn0O&{#iEaTz!>H&|8D{dI)H(Y zjVj@bj_VF3Hkm{b&igDHAg@ZhX;A^W+O=?R&9>_3Vdn(c(>k?tXw~#9u9>n|)<1?h z&O;t#gfJ^6tLqy_Msq`^)r9G8pTOku%dv$Duah>G%|~S+3qN+^Qrtn`j8@TmEsUit z$1WdC67APLEPJ59X9GZ6MW7@0z#tWMt;c&ohU{}a_)B7x+J=>tQmXhBM8Lq3;1N2|lp^oo&3OC&BNugV!*e|%U33ylJK#CFO|+Cub-Vl%?_tT$J_koUPnP&TwvP)>P_ zl|!kqs^-iC*@nqg#~-yqO7N9YicaKh*193EV#WdyVldg&KWj{Ri4|KHM?(w7K%X~b zh9S+Lrc$3YRUDca_Cv!jVEdMHd^SydBX2e->4aA(Xx^UtPUMYg=SwDxVHGc)Y^=G8 zVMeDbB`nfEhlKL?`>njQowfi{E}y!F0BbA5p%WdvDA*u)4lo$409mu^`5-VVUPV(Q z>x@U$V4gzuE$CI=I<~4qI^(3B0I+-TPA1A|e^+1b{hyP$e?pAeL}Ff>5==bU&e_WH zjkY6VV}sF+HMXfjMtR?Wmy2Av;@|6!P3xNNmfQe$+RndRaVz)I9nCh`+Y=xWY$cG* z`$8gL8WhyxIm#J_U9*F;j2OLk8P4rOC`3bOAjSR01GKs7hP}>(I@P$2+SLdC?G(>x zp`35kgA0*u`S^9#OZY`{1Ytt=u1(z#!eh;d#7+K^S(;G4YDquN&K{PmNr^_&a|mVY zxZ^3toSsy_c!DWm$0*c@#)1T+G(Kg>{1D1`*&s5 zxyFpsSMx>5nQt=4>}Mj4~BA0ztko>F{z@8~l{LH%Q`_v@UCRtC3?;GSu&>yPjZ(Lv^+o z0v4#4`O$a7(N}3NjwCD-cvPNAmsC2bX)-J4y24mY|Om05)c! 
zogQ7W--r4kWOI+1Yw3~x5mDm00c+G7P#tk+MpgEm3EO@*c?Na&aI{Zu4@MNYL{}fT z$S2&of>#$t>Ulhq-@=2a+W!>M=AZI)xYlf!X5|Q0;XPjky;&-)$lGjLEw4mmbt)ZV z`{Wu^AB2*HD{C+6?5FfDRW%pyx0hl!9l|v$3o5~^r23=ArR6cY{*F=QnsgB%M;zZ6 z=9#1%7gX@x%F)s)RU*+e#dLZ;uKhYdrn*o?v;1EF+NMICnl|9zmONzAyN%b{0cl_b zxP5(I)i^SJBxV_3y5(r^dVK0e=7WQ@>4B8y}eXZ?@9u5 z%qGj=kKu*A)kTi!^c9+MzkYjIqc_Fhkcw)O+`C=_as?Vf*B1YDfKdB+_Zm~aN>_a> ze{J@8wJSGqGXI0C$|xzT=`tpwW-+ke3*Vg2RbBaBtm%@OCTe8V4e{LykMLQU+%$WD zi|$OWTF0m3>h>QZ4vosPGlU(IN3ScE<08-|C7y8+3IhEZcSRxP~H1uM60AK#~Nj1@axj^o-5s$j>CMXF zoTZ>};SLtN&`$-Vz!&sS*&MHFAOH25#+yxC)kB#^6J!T;Wn&YN>=3-ehz+^rF-wds z{ir^WBp@e7s8;?SGZ>At%C>9xkGKPArFOX^5qnr8*Ek)bsGUk{(YV-?sp_fqWR|$> zAduDBAng?5H_rxSQFWmRuf|!w9m)6g4UMb!u0fFBhkXn*Sj|l&)pzX3Jc>IzX(w8z zReaISYU<|4T@Ek78=o6% z`J~1bE!fINMRAVtNR56OigPX@t`U{v*fJVMK2F%Hd?lRW(SpA&T`@M@ws7634&ZQp z(MN6nC;wjvBC67CtH5kv5|4R z&PC^5$p*Sw8lnHe(i(@CE>wt9;uXeEH>!k?kg_JFh45}W%fApioV^E9!t8xAtLnqmaFe>!l5%ql;e@KW>6G6) z#@g5oHI$T%<7W`C%I<(%7ivEN3*T|Yby&Ho8}8mW5*O=ZfNH{?wa$f^QHldRwriP3 zz8n9O>wjlV9Hmh9JWhIt>?@6Y&AN^1H4~1DlG`)^G2ppy3%_b|Kah@1$TE))iN6_CG7b?s^}br%EI@5`KoWQ zDpXf)S5@e-*&&$rb;YXDP3{bN$?%>FB;@mT?I~}pH0BAg9lhcx(makhh4vi@ab&B! zb#l7Pq5^*(4=ns#d`+e4ou;ti{ri>y;q#iI-|Cz#F9Mjp;pLi}rUS&N5=_)J_4J(H z!w|-Co!V(>xt41Ro{OM2xVqzoRR*blH)$U$vEHMAL7-Q4tn}lwxrh3u^ef$pk(-(e z^I^U{Q@)foj;Hxwe!$!=UNk@y~mHy4Z>%W-m? 
z{vv_i>vzH{{y3q38J+Fo*X^372Kd!a-9CGvr(jcA;^&<^NtdQwTihbs{5 z(h<+DOHFzC^3VJtuNGeX)G+IRxd3~~ zNUqjI|3*|)NIP%o153_Sh`IwhIwQYN7Mp(b4gKPaw%2`2`;o^UOG`x-Ve4%p;@=Rt z&K^ibczNzsnxEMD>TB8(23JA8Z+h^e3gr){2tADPu^S>pu1}Wk+8tW-B~h|42V)9d zhlkN^7X}7UElRko+FXC+Z&MsHq4q1MS2aP7hmQVTPfotie7ENG zdzlZrwgEd74@dd=vUa#?jlPv%FHD1lSiOnDZc)YW9GkPX-@$MRw00?$o~6NGyW!qu zeg~H>as@%2le&CAsqwwOHObAr(2zZW6v2IzxTsxtao3IMyIJ(_(12jlR6ob_210W` z^`SYq%j|k;ej<+H1~olIzt!KZ_H14r9XlphH|rt?q5>?b{W%xG2va=-1+4vYzKdAb z1)B~~JMQlCH&d_Y81*CyYYj>Ka!ycG8(uy;aSyiK9?h_hT8@tF3k{P<h_%hS-D`*ia0;sF>&2z==X+S=-g!(5%f!Z(#rM32$IM|xXl_}zt zT)SlEv-K%W$M*&6E@n@Xi}qJA;a{Zk#?RQgQdXV|E)X6 zNf$QP|L<>g^Mb)L?7%`$V^NuJa)TNFUU2Yk>1pbUEQS{L6wo^8T2)We?m3$4px?`W zk`>oHhlq1V>}++BnF_E_$AofG^iUVDtFluZ8bB2hRYr^3KaO zHq82u%@WQZR+lCd_v0glV!o3|dZ&e-LDooqIc;bB#j z@G>L5Gc-t876#4+2wwM0+#yH5IE*h`9FZobpI4Blc5Al`Ww-abo*1~4>{wcwy7mkr za>qI{MHapeE&43kHsC0+V5hv?$^B1M)9zfHYv=a&jijh}dFT$P8aat40t1Q5AWqeKj0ww}2xj7Z~*{|`R3egSC7**wWIcW1<{u`aH?ny~(AXCN3;=*Eh zHo5Dt9S{49tM4kX7rAidNZ^kYPnm0VYEalIx1}4)+K>2k2)+KlE!>;}Pag$Yh3^wR z-0BRIh0`+(bwj5`B$|k`@-o`0ifTDZRLPpkAXxp%!=MGU{i&g0$u8O!o1pA2i15rq z0VD`l6Emzwt0e4Fxj_I);_ST&SjsUc+FeG#@ZpF^IPO$RwX*Kz!CErukM5&h zFoCd2o9B4rQY0#qh&QH1#RmFUEG!;3%IIcZ#cODA20epdTpaefcERgO*JVNLxXFyH zk2e*P0>F>EHp~@>*%_Mu?%}a&sQf}Tqci(x^EhEo6>{1x0{K;b+{Q*o`Sp62W*QDn z2w96zWc{ganX2jA#rw#bHqd|euEyPnf`EU?KUU_ps<%tW8XCstM-S6@A^Yd6iJrOa zV+&ieyuDJib63M8IAgIjaOu9t7h46jn9EojcT3LoxVX3TIAh>4SXvf?HgW5Sjf+bo zdCpx1<;uu$o)AR1V9QotIGvH&+Jz5=gdVMZH<5L+B#43*Nvb9 z_nKdS(7cNF#1-5Zo#eBYD6Y(G@6=-fkt z{a+P>7U>TfZ)b~Uzea1@34k)Eh_#8NFQ!4w z?}O{2&5g~U_oei41>(~nW`7I-*aVM-dj6+Nrji{I@>i$~c^KjD!g)~W%!X5q++7y= zHghC*f8k0NFS#w>%+LF*)aJLJI+^uYViw}cEd(J2c`i6o!&*b(r?t^Wpj^@VvJ(zg zA&r|pq~f5Jx9}2;O0-hoz9gJ43+O=kq$^F?$^|1fmt`AnGq||hG*VynLQ2UWXt_fV z(wErP+2PeizF_|v0SU9G?L&GoH`w^X3m65%&|@@=Iz9BM$8m6Az*45Scc7}k1YR!L zS-mH<>8Rne2$u9xkvO-u7^pwh~mxazJpVb5jIx?D3}& zivP2SP=+P=WkMyYlA9)5DM8bjK{}E}H#q$n8Dlu?=|(6-&-5>NP}Ot_ZF~)-$|pXm 
z)OSusz&688DvsV&KKAi3KBD1Scmj1gs~Zuu+)lHa@e!18TyxyLb9m-+-JmbP^WOow zAJQeg0#XF*P<1CSu zx(JtAMtdx$9oLB#IDbC5qr#^$K+0;J$ki1`9=oj{M1rPg$n$A!ae}~2Ik`UL6fmN) zENa*G(MH$UlA+>Rsnq-g$!rs_+Tq&4AcmAlC5DI0JPvC5{5Naq68n{G@_xRK_NSs+ z`V6T;rby&!Q;_?$j7Qdnx|af=!VXuPTNo?B7wzO0HV#;wkInU1h_uOaGw4S*XudWZiJ%w9_2!OACwr3=f%=Hufs|6%% zXNje*NnK;djN|>Nx3tR2L<0w>E3IYej$EcnyT zw;ziWtdAK&a>)Wxm>X6~C8o>kzM=AQ4=f|zeblBlYnfcH_^}lG@t>lZl_!VgIdgN> zbhhLh=S;8d-9Ztf_TU(tnmQC~5))@=?91vyj%rYW0Y^IvJ;MQEb zy4!_~ei;rIP#N7XIqhlNPg$`@0ym0^m3MC+1kz=#8GbNa^z1CIiSH~*=3T+74FnoF z(1vKb1XPx&1O!N!?B(l3U@Oo>WO<4H<$Tc;8{7uQd1A5p?)RIj73$kAqd#J->m`)R zi<~jHxLH+xM{yH5+Y#HWzvCnx;l{sKNOp?EP84}iB7B<4%NuH+JlpO5g70FySZnIG z4)&gUt8d@bvT@+1XGJ+Ku~okPCMVHg`b;u*gzo4x=eqsJCt217;{cz)s)C&oURe9@dc1b7B|Lx*J zhONO>OP>Q4?5%4YOd;i6?|-v7x}ep;_Ts}QbP~2%I}QZuboQ56(WNL-1M5F{0KF}# zb9*p_z4OVQ-69iC>dl_R`8TWlB5Fdn{B}duM!MEb2XEnAL*2GEqGwl!CMIg#9tZzc zV9S+VkI3AtMBkDSGvPNWJ#Fh`WZg0qy1B_!kLFCaWQSK5LGF%5{fXUG%is#ZWvcty zIZ{yixMaJslv9fxc@p9McAR)j{T?Bln_R&!9ClMtw70^xnXkEJg5pluo1w-b2U`ig z$t}nmRK^2mp8#-T!r7s%J&vH_KYuk{4l8Fw7)Dj#v@cheZ7=4;=1}62aV;03epUv4 zV)}&wcX|NH1?U3exQ06M!)?Qyb#%*}ceDWW2gWuyv8lf$Z%)DLi|~aQ{Blt3=08E$ z-VA=ZvYcG;`{qU1>bnKr3M*>jq zL0cy0Lpw}W6hdXt+g0(JK(q9Z?V{?)BWg3%riz1f zfT70NamSaiS`75?w_c2o2DJ8B4f|kb$i$KfHbfqhKmZ6cB*a+_A9{%K)u9drKcW2R zh#Lv*q>Ty-D=Pf_!or>eVXxHGHUTFbGI{6OQF*6q8y7nc9~MhJpo--r%6IRDT+ThQ z#+@|geA}8?$!Hzy80D+jn3}dodnnL|-lO3 zMz1eLqv0ShGs(CtIV8oVAx{?;Z0;2NcV{iMS~%U-&(n(O&y#ZWqE2km=Rz*pjax{3 z$Ek^}lcZ1)S50%@KSD*0?A0_6(Vl*Ha_p>FqYZZE|qMseUlMX42fJH zi9c6_Pj;kfG=zWrymfqdv5e(W>AV^ch4l1$>f+}ouNe~5Kv<$c^jgMJo?j>1rfa9Q zNpL_1YLk&zrhgu@E|}bxek{_Gu3z9bc(n!!m-3F-f3W*!R=9qF_7|yq?t>6BmOV*H zE?d5ed;O}Q{u?|06{w~}7pm8!?w|bkw5*VEVYsQ z(8Nrdf9OhNCr+5DOR@kv**Z8lUID$kvbMRlCXVXr&uLcF*5dIi`1$OF4tY+~Mafvn z8n#Syddv=*>}tdFYw^z9kLUIyj&0T-C4rKAg>9}+VRkHUY6Z^C;nv>*PsgFTrJts| zPPR^sEu~E!#wjNWSz~vx--L~2&ej8YRBm5a`6=K2CF1}7y$i1F@-v1E*d#y?yRqcp z*Z5*j47U!gAFj&PTJrFiUm_$uXB#=Mud7is#bp4Ru;Y>FuUFRPiX$c_*`9M5JIe!N 
zl(e6ZaD+YPmX1sToxDR3A@=qm8?CirG%Gp6ZN@QNPt}9eA~YSc^~Amr-AnFf<5+2M3_A6UUzNqeJ+7rgb(_v~WE~_h_(ol=MROG$@Uu9BpiwHV&2Vz&>KJDZw#y|H%Byq^_Li2G znNz-{>;J2X)Pyp2Kg=!um9P7(0@A>8TV@otqS3_J(T(dBp#FMs3Q!bZusiZ&5oHW< zHZ`R>-0~`HZkqM#No7%DU=XGww5p}9QWsmB9rnmZ4{F@A!wxz}2;^2;!v^qeGL^6Q zs~e8nmzMN=HKWSLDeYSX8AD z)&JApl?FAHuIcV}p|7-JHwZQenQ0Z2Eh+-q2&ZiqS`i|!KF>#RKyJ8G(RUQ?6y-~mgZTRmlDli*se1pFN0Ql3D~(p-S94`6iLrUfdyP<7 zH7waSIpLdhbBC|OM^$2Qb8_Co$UnH4msK05IgUm@mx#E5!7KfQ;*UQ-sfFY0+H*1bur$UU?wdu;;W`6$idL;A#< zgTkDq^1iW)SArJaJ`6fir6~MSPIcU!89H;Ij{XSlgJ&v3B^OQId%4{w8!fQkFGZjQ zrzJuuJ1v?P+BbQs7Mr?=`)d7QdFkaHr}EN}jpSKnEn#iMY{b59FK>7Mjk%~GSdX+J zIL^v5k!i(9z8iA z3|rfT`6YUDo#$4ow55FGEq%`jDQSvkm{b7l2+>#e7fCbGHnfA&+(p*WIh9*W8&pNfd(%G=J4vLISnK$=mZ~ zjppc**irRv8oc{yLgW%_K!24HWU?%zCM05=9%nt zNXuJU6PQ9=Pw21MW%F=L8L6m1d?U7ElPEN-@g?-mpIUOL#(O-xg|i_YKKPb9HaKAA z3l&VMdODN4UJwXu$DKaF6P?7<_lBfmf&tP++y3tt)zLKszCUss5~t%;g9sM5T(q_g zOc+~=ZrPw$AwsirM#V{^tB6}&RnG?*v-tCH3^CD;hQV}L6zOO*MY>n}ja7TvBHE;5 zTWESssbrlOEcf-=lzI`pc(^Q-}O4?I_1xY1w`I7sYt9bK6r__;24o z`R{m3Ty#j01FqK|$mSOmMRFb;+CREDf80XH=k>YT6-6$;(2O_=Yv`ZtUEh+cGagfK zJ|GWl8=mKu4Ku;6fmZ9wiNfEnIc!IklggiNJDy=$!Liz;c5>E-(~5LOLA*7%zSa_G z^-!)*r&szLy0kNjW^AhdjvhUEDJtO#KVYDdjQ!3`v2%x&)or%^*4$2E!+$Zl`9@Ht zZ#~QIcIbt2nrT1HTrpll&^6Mx8mJ-rzNqLaaQWQ(Md#mW%}!?F3az=j?aABqLx0^S&rhU%H_ZzqnmPs)(GHH8#T-Kj0uw{Fm@e_kPW*;)0?(3f4-dYl+ zt%u}!8>ZVaUp7bye}vwa*Z*c47d0j>1>zIbH8#WoqPRu*xN^MiX>4>5vJ-_-&E1hx z!^Juc`%Tze-A$^>KnsQ9h;;qsx5(eihfNN;BwQg`w?;mW+qnyQGWc%{-X&DwpOo%5f4;;}-aT)~HaW5MWkjyV{Io*je0ezD{RPFi z3mPwb<0YN1fAg^Whp%s&ALNkH921WMY1Mt zQl^z`&TdNz;m;UxC~P%#HLufdA;pNXDr8Sn;&;b4r%}rHfB4?^eNR_cu3=n3b;I+~ z8-vo3Fv}Rm)M?MQVf4AaUwlsnW*Y0-=NsG8s~9H|B5`k-?=5mZpNZ2Ukfa!$Y^>kL;!SB``+0$bmeG64M`NouHPxj5+y_cF; zWxj4#L@*-#i#ylvwFW7w+PS@5LiEhozT}`R#%NW**0*#Hb*$w z*Ks$uF8X|nzG=px(W+AroevD=Vb6yt#0;D#&}k?pH^eV6NX( z@)ASI%zpOl=i@duFhJUUc-$nXJ)(Gz^y3-Mo2}ZPeKz+1_VE}+BCARsZ)^ScwFycA zTKl_h&qIw^T2%BC7|}G1qP(k7PhKzVbciqPU9Zxu3TVdSmKv!xW%I9I6C@4xQumN! 
z@gcG3inP|dA|dl$br;N&P--7LngiTQ``Wg5V?S_yt8s9+58EM4{)o;r%<8rG7Q?wO z_K;pd`&TisLCpJ}tE&6U8*KK*Qi&;&-&4#llA@v_!kcawB{^od*y|JPXz$hCCywV@ zl!h%UJuVH2;*pph?fxNgnvWrjv6XQ98V<{yp0~N2IB_Bza*ue>FquTycYe1-aLpHx9-HdVKn-O@pn4+Z!?%r;>teNfO;8Qia5 z)}a+;lNEPN7{)2mEl5RIbj&PxqEp7u;N6zeHsC0m>jMMO zXa?)zm6(0;wmA=XhJTMnS1$ywG3P@yjnXc!c~Mc{4AP|BI=F#UD4CChaXj2*Q_R~! zipSfo^C*r()iif8rm3m<))$vM`Ozwxv%md2;6SI;(kqzng$~}Rn$qjf zi_@G9!O9D-H4SxU`O#q6hat+&-hRuDq0L2lUQBL6WCN`PKH10Cnj57x4n)NX>fMgr zP(9?lFo?;UF`;~mk~u?Gth{HXmy0bJsGl;*viq@kT#T8g#=wJzzwd>(&aC~f!g_X|y>KA{lM22BuY6tr& z>gJ;8HFxan0e62oW1qyt#nsgfg&8RfA>i~Qn3OuvK3ZJUgbQ(Rx3>Ewo8&o@4kmtr z+v84$tswsN4iq+>8q7(d>1?85u!jlMWVUX|rd$bBPlU0H+^mqOgxxrn{ZamdJWAW^ zHgj3n84a&+@^L{`_1qa7N=JGau7YM$7+|gg3rl^HQ?I612d<=kbB1%uA1AIqZ4s|>USBnHouDDwp+y6^|A-73;i`#~ zX#v5@JmjpSNZd>%?gDBy22M0nIURxq$j}Zg4!qD@Qfr_e0q!$b5Pcp5iIM_<^g#i~ zK>w-JGF$x4@OlM21(lScqjRs9=PM#eR5soc<5a9T}XUF^4?Pgel#LOzK{T`8Aq$fe5*L(12Qz& z1#&ucV%CGa_Xc27n=OX~^s2DHgMehb(}IK6Wl$K+QTL^Fq~HikjV<7>=VPA8r`4~b z{(^T>nL!E=NSLqzj1_oj9z~vOMweSsxU~=~ZWxarcLOyGIuRQhRE-jFiPPs%`YPXH z23p4UL*iEsDtVKf0|CwvdiRZ?77jR-&swgQ6c(gzIUqwATcsm4@n^;aPeBm_(Ow$1 zL&8G>o^mx4NH8^lhvcI*H9R~#^>YN4Rc{L{Lb(*ew0$U3Y0r9K5IELM1$%!7APK0f zAj{rMh)Y;U@)W+9qG=a#wfZ5*9#ZyBPx*|bVggya@1dvK-#Ih{r1ytVG(ugWN zA@vA4wOu`(vud1!u!IQSXmvLh44o#}katH#k3^%@NxiNWUhgqDb?AsnLY!Nah+WmV zAxsL?S4|)a6>jPg;JY{NV88#^5FaXlX62yI^yqmXli{%>wqSXu|Uuc?v-tPsm3o|srSBH>l z4Ov~A!m6C72dT&sZAemncSZm;Eh#WML}}-mVS@~S!jt2{wyiA2f*Xo54jn^7;3U*k z_2K5Ai(`xoVAL*>;U7r{8tE0Hk5#oDKtVP4kI)MXiY)kdT)t~9+)BrJKm>* z-dIa%)-+b#9ORHgGCO zJ=L5`CTTd6=Wd{jl&Jna1>30tO(Deh)jo^pL?7#%bc=m)j?*tIYrFG!q$5zVKxI40 z*+Bd(FE&X?0i(j%6x!Kn0X0W)1P+Q<*Du20E0fP!9+vArE`3t@F&o%SWoFz?TrQ~# z+p8|g5u%6EHbwk=mBg&vrD}S$s0F-b+R94M10WCus`(s|Li}-QSSV0L%@$L@z$<4g zWf^>+kvxA-?!aPzjVK~>F1iZ7YFH-R)P0&4p%KktLHn1h@0E^zJ4*jw|fs( znebt0BW!GYi@$AoOf{F?qs_Ygm^vfbfY#AyaOY#YidbON$KJQ^L%>)2&1)mRBAK<`Vb1BK8Hc{K+22*xTUQR0Qxb~zl_^r+v?$nvAlQBcDuxR@$Y0= zPQxinO;T4K70Z0jnIzrI7tp3^nQC@yS2vQRC{U#1BQggWJM52>CCEuKU5d@g>7ur? 
z&u$-29wFR3NQKV23q7lJfCtK9HJxD%@36#B*f`}c8=-Drid8cE0#lVO02lHww+AVz zw~%bIr|nW(nlZIa_QCJe1Ex)x`@{qlt(rxhMPQ{!&7eS&Pdlq!^q!h(S(^TmPj*8gs6eu}dlFvV;j9XHeNRrBx)+gq@jW(u=#1 zqMsjUAbqlepy4qssCcFkktlW{tiwXPY-K}ovUlJ{75l7cG_1vD9Gbd8ja>q#7>$A) z6HPv#oX48315^mo0V$y27v<Eaho)QW~utgck8=y{Yh7x-%3bAgrmF`yeRJp@MEW8Wa^%CtZ8whEPFv{RAglD z<1?Rg4+(YpQg;u+b>|O%wM{|YK@Zd`nmAKEpt|;vlR9yOs^5_P!B*$>GC)5n~z&DzZk-w|0jmvla=;=uZus~h5zH+(x;X4 zY30DdX7$f&u1&xOiC>Pc+#~SGksltCPqz+yx{~2v0)_gc*!Xm{27GY%bnnPN;wt#h zX9D=oQxJZH&bMq>sYpIstq4Hw`ti)`cSl#Az6xJ?`P-+5|A#SvdzT{EtKL9#<76kwR literal 0 HcmV?d00001 diff --git a/notebooks/datasets/amazon/amazon.ipynb b/notebooks/datasets/amazon/amazon.ipynb index f5680c7..4a22c4c 100644 --- a/notebooks/datasets/amazon/amazon.ipynb +++ b/notebooks/datasets/amazon/amazon.ipynb @@ -471,18 +471,34 @@ { "cell_type": "markdown", "metadata": {}, - "source": "from brightdata.datasets import export_json, export_csv, export\n\n# Export to JSON\njson_file = export_json(data, \"amazon_results.json\")\nprint(f\"Exported to: {json_file}\")\n\n# Export to CSV\ncsv_file = export_csv(data, \"amazon_results.csv\")\nprint(f\"Exported to: {csv_file}\")\n\n# Or use auto-detect based on extension\n# export(data, \"results.json\")\n# export(data, \"results.csv\")\n\nprint(f\"\\nRecords: {len(data)}\")" + "source": [ + "from brightdata.datasets import export_json, export_csv, export\n", + "\n", + "# Export to JSON\n", + "json_file = export_json(data, \"amazon_results.json\")\n", + "print(f\"Exported to: {json_file}\")\n", + "\n", + "# Export to CSV\n", + "csv_file = export_csv(data, \"amazon_results.csv\")\n", + "print(f\"Exported to: {csv_file}\")\n", + "\n", + "# Or use auto-detect based on extension\n", + "# export(data, \"results.json\")\n", + "# export(data, \"results.csv\")\n", + "\n", + "print(f\"\\nRecords: {len(data)}\")" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { 
"name": "python", - "version": "3.11.0" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 5d63337..1d0f90e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ where = ["src"] [project] name = "brightdata-sdk" -version = "2.2.1" +version = "2.2.2" description = "Modern async-first Python SDK for Bright Data APIs" authors = [{name = "Bright Data", email = "support@brightdata.com"}] license = {text = "MIT"} diff --git a/src/brightdata/__init__.py b/src/brightdata/__init__.py index 244593b..3408909 100644 --- a/src/brightdata/__init__.py +++ b/src/brightdata/__init__.py @@ -72,8 +72,13 @@ SSLError, ) +# Export Scraper Studio models +from .scraper_studio.models import ScraperStudioJob, JobStatus + # Export services for advanced usage from .api.web_unlocker import WebUnlockerService +from .api.scraper_studio_service import ScraperStudioService +from .api.browser_service import BrowserService from .core.zone_manager import ZoneManager __all__ = [ @@ -124,7 +129,12 @@ "ZoneError", "NetworkError", "SSLError", + # Scraper Studio + "ScraperStudioJob", + "JobStatus", + "ScraperStudioService", # Services "WebUnlockerService", + "BrowserService", "ZoneManager", ] diff --git a/src/brightdata/api/browser/__init__.py b/src/brightdata/api/browser/__init__.py deleted file mode 100644 index c4eee11..0000000 --- a/src/brightdata/api/browser/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Browser API.""" diff --git a/src/brightdata/api/browser/browser_api.py b/src/brightdata/api/browser/browser_api.py deleted file mode 100644 index c4ef4ff..0000000 --- a/src/brightdata/api/browser/browser_api.py +++ /dev/null @@ -1 +0,0 @@ -"""Main browser API.""" diff --git a/src/brightdata/api/browser/browser_pool.py b/src/brightdata/api/browser/browser_pool.py deleted file mode 100644 index 10a095e..0000000 --- a/src/brightdata/api/browser/browser_pool.py +++ /dev/null @@ -1 +0,0 @@ -"""Connection pooling.""" diff --git 
a/src/brightdata/api/browser/config.py b/src/brightdata/api/browser/config.py deleted file mode 100644 index 8682d30..0000000 --- a/src/brightdata/api/browser/config.py +++ /dev/null @@ -1 +0,0 @@ -"""Browser configuration.""" diff --git a/src/brightdata/api/browser/session.py b/src/brightdata/api/browser/session.py deleted file mode 100644 index 10ab0d9..0000000 --- a/src/brightdata/api/browser/session.py +++ /dev/null @@ -1 +0,0 @@ -"""Browser sessions.""" diff --git a/src/brightdata/api/browser_service.py b/src/brightdata/api/browser_service.py new file mode 100644 index 0000000..97adf97 --- /dev/null +++ b/src/brightdata/api/browser_service.py @@ -0,0 +1,62 @@ +"""Browser API service — builds CDP WebSocket URLs for Playwright/Puppeteer.""" + +from typing import Optional + + +class BrowserService: + """ + Builds CDP WebSocket connection URLs for Bright Data's Browser API. + + Browser API provides cloud-hosted Chrome instances connected via the + Chrome DevTools Protocol (CDP). The SDK builds the connection URL; + you connect with Playwright, Puppeteer, or Selenium yourself. + + Example: + >>> client = BrightDataClient( + ... browser_username="brd-customer-<CUSTOMER_ID>-zone-<ZONE_NAME>", + ... browser_password="<ZONE_PASSWORD>", + ... ) + >>> url = client.browser.get_connect_url() + >>> + >>> # Connect with Playwright: + >>> from playwright.async_api import async_playwright + >>> async with async_playwright() as pw: + ... browser = await pw.chromium.connect_over_cdp(url) + ... page = await browser.new_page() + ... await page.goto("https://example.com") + ... html = await page.content() + ...
await browser.close() + """ + + DEFAULT_HOST = "brd.superproxy.io" + DEFAULT_PORT = 9222 + + def __init__( + self, + username: str, + password: str, + host: str = DEFAULT_HOST, + port: int = DEFAULT_PORT, + ): + self._username = username + self._password = password + self._host = host + self._port = port + + def get_connect_url(self, country: Optional[str] = None) -> str: + """ + Return the CDP WebSocket URL for connecting to a remote browser. + + Args: + country: Optional 2-letter country code for geo-targeting + (e.g., "us", "gb", "de"). Appended to the username + as ``-country-{code}`` per Bright Data's format. + + Returns: + WebSocket URL like + ``wss://brd-customer-abc-zone-mybrowser:pass@brd.superproxy.io:9222`` + """ + username = self._username + if country: + username = f"{username}-country-{country}" + return f"wss://{username}:{self._password}@{self._host}:{self._port}" diff --git a/src/brightdata/api/scraper_studio_service.py b/src/brightdata/api/scraper_studio_service.py new file mode 100644 index 0000000..c853e1e --- /dev/null +++ b/src/brightdata/api/scraper_studio_service.py @@ -0,0 +1,147 @@ +""" +Scraper Studio Service - High-level interface for Scraper Studio operations. + +Provides run/trigger/status/fetch methods following the same pattern as +ScrapeService and SearchService. + +All methods are async-only. For sync usage, use SyncBrightDataClient. +""" + +from typing import Dict, List, Any, Union, TYPE_CHECKING + +from ..scraper_studio.client import ScraperStudioAPIClient +from ..scraper_studio.models import ScraperStudioJob, JobStatus +from ..constants import SCRAPER_STUDIO_DEFAULT_TIMEOUT, SCRAPER_STUDIO_POLL_INTERVAL + +if TYPE_CHECKING: + from ..client import BrightDataClient + + +class ScraperStudioService: + """ + High-level service for Scraper Studio operations. + + Access via client.scraper_studio: + + >>> async with BrightDataClient() as client: + ... # High-level: trigger + poll + return data + ... 
data = await client.scraper_studio.run( + ... collector="c_abc123", + ... input={"url": "https://example.com/1"}, + ... ) + ... + ... # Manual control: trigger, then fetch later + ... job = await client.scraper_studio.trigger( + ... collector="c_abc123", + ... input={"url": "https://example.com/1"}, + ... ) + ... data = await job.wait_and_fetch(timeout=120) + """ + + def __init__(self, client: "BrightDataClient"): + self._client = client + self._api = ScraperStudioAPIClient(client.engine) + + async def run( + self, + collector: str, + input: Union[Dict[str, Any], List[Dict[str, Any]]], + timeout: int = SCRAPER_STUDIO_DEFAULT_TIMEOUT, + poll_interval: int = SCRAPER_STUDIO_POLL_INTERVAL, + ) -> List[Dict[str, Any]]: + """ + Trigger a scrape and wait for results. + + High-level method that handles trigger + poll + return. + Uses trigger_immediate internally. + + Args: + collector: Scraper collector ID (e.g., "c_abc123"). + input: Single input dict or list of input dicts. + Each dict contains scraper-specific fields (e.g., {"url": "..."}). + timeout: Maximum seconds to wait for results. + poll_interval: Seconds between poll attempts. + + Returns: + List of scraped records. + + Raises: + TimeoutError: If timeout is reached before data is ready. + APIError: If the API request fails. 
+ """ + # Normalize list input to individual triggers + if isinstance(input, list): + # Trigger each input separately and collect results + all_data: List[Dict[str, Any]] = [] + for single_input in input: + response_id = await self._api.trigger_immediate(collector, single_input) + job = ScraperStudioJob(response_id=response_id, api_client=self._api) + data = await job.wait_and_fetch(timeout=timeout, poll_interval=poll_interval) + all_data.extend(data) + return all_data + else: + response_id = await self._api.trigger_immediate(collector, input) + job = ScraperStudioJob(response_id=response_id, api_client=self._api) + return await job.wait_and_fetch(timeout=timeout, poll_interval=poll_interval) + + async def trigger( + self, + collector: str, + input: Dict[str, Any], + ) -> ScraperStudioJob: + """ + Trigger a scrape and return a job object for manual control. + + Does not wait for results. Use job.wait_and_fetch() or job.fetch() + to retrieve data later. + + Args: + collector: Scraper collector ID. + input: Single input dict with scraper-specific fields. + + Returns: + ScraperStudioJob with response_id for polling. + + Raises: + APIError: If the trigger request fails. + """ + response_id = await self._api.trigger_immediate(collector, input) + return ScraperStudioJob(response_id=response_id, api_client=self._api) + + async def status( + self, + job_id: str, + ) -> JobStatus: + """ + Check the status of a job. + + Args: + job_id: Job ID (e.g., "j_abc123"). + + Returns: + JobStatus with status, success_rate, lines, etc. + + Raises: + APIError: If the status request fails. + """ + raw = await self._api.get_status(job_id) + return JobStatus.from_api_response(raw) + + async def fetch( + self, + response_id: str, + ) -> List[Dict[str, Any]]: + """ + Fetch results of a completed scrape. + + Args: + response_id: Response ID from trigger(). + + Returns: + List of scraped records. + + Raises: + DataNotReadyError: If data is not ready yet. + APIError: If the fetch request fails. 
+ """ + return await self._api.fetch_immediate_result(response_id) diff --git a/src/brightdata/client.py b/src/brightdata/client.py index 0f69649..02bdc98 100644 --- a/src/brightdata/client.py +++ b/src/brightdata/client.py @@ -27,6 +27,8 @@ from .api.scrape_service import ScrapeService from .api.search_service import SearchService from .api.crawler_service import CrawlerService +from .api.scraper_studio_service import ScraperStudioService +from .api.browser_service import BrowserService from .datasets import DatasetsClient from .models import ScrapeResult from .types import AccountInfo @@ -67,7 +69,6 @@ class BrightDataClient: DEFAULT_TIMEOUT = 30 DEFAULT_WEB_UNLOCKER_ZONE = "sdk_unlocker" DEFAULT_SERP_ZONE = "sdk_serp" - DEFAULT_BROWSER_ZONE = "sdk_browser" # Environment variable name for API token TOKEN_ENV_VAR = "BRIGHTDATA_API_TOKEN" @@ -78,7 +79,10 @@ def __init__( timeout: int = DEFAULT_TIMEOUT, web_unlocker_zone: Optional[str] = None, serp_zone: Optional[str] = None, - browser_zone: Optional[str] = None, + browser_username: Optional[str] = None, + browser_password: Optional[str] = None, + browser_host: Optional[str] = None, + browser_port: Optional[int] = None, auto_create_zones: bool = True, validate_token: bool = False, rate_limit: Optional[float] = None, @@ -96,7 +100,11 @@ def __init__( timeout: Default timeout in seconds for all requests (default: 30) web_unlocker_zone: Zone name for web unlocker (default: "sdk_unlocker") serp_zone: Zone name for SERP API (default: "sdk_serp") - browser_zone: Zone name for browser API (default: "sdk_browser") + browser_username: Browser API username (or set BRIGHTDATA_BROWSERAPI_USERNAME env var). 
+ Find at: https://brightdata.com/cp/zones + browser_password: Browser API password (or set BRIGHTDATA_BROWSERAPI_PASSWORD env var) + browser_host: Browser API host (default: "brd.superproxy.io") + browser_port: Browser API port (default: 9222) auto_create_zones: Automatically create zones if they don't exist (default: True) validate_token: Validate token by testing connection on init (default: False) rate_limit: Maximum requests per rate_period (default: 10). Set to None to disable. @@ -121,7 +129,10 @@ def __init__( self.timeout = timeout self.web_unlocker_zone = web_unlocker_zone or self.DEFAULT_WEB_UNLOCKER_ZONE self.serp_zone = serp_zone or self.DEFAULT_SERP_ZONE - self.browser_zone = browser_zone or self.DEFAULT_BROWSER_ZONE + self._browser_username = browser_username + self._browser_password = browser_password + self._browser_host = browser_host + self._browser_port = browser_port self.auto_create_zones = auto_create_zones self.engine = AsyncEngine( @@ -133,6 +144,8 @@ def __init__( self._crawler_service: Optional[CrawlerService] = None self._web_unlocker_service: Optional[WebUnlockerService] = None self._datasets_client: Optional[DatasetsClient] = None + self._scraper_studio_service: Optional[ScraperStudioService] = None + self._browser_service: Optional[BrowserService] = None self._zone_manager: Optional[ZoneManager] = None self._is_connected = False self._account_info: Optional[Dict[str, Any]] = None @@ -208,12 +221,9 @@ async def _ensure_zones(self) -> None: if self._zone_manager is None: self._zone_manager = ZoneManager(self.engine) - # Don't pass browser_zone to auto-creation because browser zones - # require additional configuration and cannot be auto-created await self._zone_manager.ensure_required_zones( web_unlocker_zone=self.web_unlocker_zone, serp_zone=self.serp_zone, - browser_zone=None, # Never auto-create browser zones ) self._zones_ensured = True @@ -313,6 +323,73 @@ def datasets(self) -> DatasetsClient: self._datasets_client = 
DatasetsClient(self.engine) return self._datasets_client + @property + def scraper_studio(self) -> ScraperStudioService: + """ + Access Scraper Studio services. + + Trigger and fetch results from user-created custom scrapers + (built via Bright Data's AI Agent, IDE, or templates). + + Returns: + ScraperStudioService instance + + Example: + >>> data = await client.scraper_studio.run( + ... collector="c_abc123", + ... input={"url": "https://example.com/page"}, + ... ) + """ + if self._scraper_studio_service is None: + self._scraper_studio_service = ScraperStudioService(self) + return self._scraper_studio_service + + @property + def browser(self) -> BrowserService: + """ + Access Browser API service. + + Builds CDP WebSocket URLs for connecting to Bright Data's cloud browsers + with Playwright, Puppeteer, or Selenium. + + Credentials are resolved in order: + 1. ``browser_username`` / ``browser_password`` passed to the client + 2. ``BRIGHTDATA_BROWSERAPI_USERNAME`` / ``BRIGHTDATA_BROWSERAPI_PASSWORD`` env vars + + Returns: + BrowserService instance + + Raises: + ValidationError: If no browser credentials are available + + Example: + >>> client = BrightDataClient( + ... browser_username="brd-customer-hl_1cdf8003-zone-scraping_browser1", + ... browser_password="f05i50grymt3", + ... ) + >>> url = client.browser.get_connect_url() + >>> # Connect with Playwright: + >>> browser = await pw.chromium.connect_over_cdp(url) + """ + if self._browser_service is None: + username = self._browser_username or os.getenv("BRIGHTDATA_BROWSERAPI_USERNAME") + password = self._browser_password or os.getenv("BRIGHTDATA_BROWSERAPI_PASSWORD") + if not username or not password: + raise ValidationError( + "Browser API credentials not provided. " + "Pass browser_username and browser_password to the client, or set " + "BRIGHTDATA_BROWSERAPI_USERNAME and BRIGHTDATA_BROWSERAPI_PASSWORD " + "environment variables. 
" + "Find credentials at: https://brightdata.com/cp/zones" + ) + self._browser_service = BrowserService( + username=username, + password=password, + host=self._browser_host or BrowserService.DEFAULT_HOST, + port=self._browser_port or BrowserService.DEFAULT_PORT, + ) + return self._browser_service + async def test_connection(self) -> bool: """ Test API connection and token validity. diff --git a/src/brightdata/constants.py b/src/brightdata/constants.py index e15b2ce..369f2d9 100644 --- a/src/brightdata/constants.py +++ b/src/brightdata/constants.py @@ -46,6 +46,13 @@ COST_PER_RECORD_YOUTUBE: float = 0.002 """Cost per record for YouTube scrapers.""" +# Scraper Studio defaults +SCRAPER_STUDIO_DEFAULT_TIMEOUT: int = 180 +"""Default timeout in seconds for Scraper Studio run() operations.""" + +SCRAPER_STUDIO_POLL_INTERVAL: int = 10 +"""Default interval in seconds between poll attempts for Scraper Studio.""" + # HTTP Status Codes HTTP_OK: int = 200 """HTTP 200 OK - Request succeeded.""" @@ -53,6 +60,9 @@ HTTP_CREATED: int = 201 """HTTP 201 Created - Resource created successfully.""" +HTTP_ACCEPTED: int = 202 +"""HTTP 202 Accepted - Request accepted for async processing.""" + HTTP_BAD_REQUEST: int = 400 """HTTP 400 Bad Request - Invalid request parameters.""" diff --git a/src/brightdata/core/zone_manager.py b/src/brightdata/core/zone_manager.py index 43c4a06..087053d 100644 --- a/src/brightdata/core/zone_manager.py +++ b/src/brightdata/core/zone_manager.py @@ -42,20 +42,14 @@ async def ensure_required_zones( self, web_unlocker_zone: str, serp_zone: Optional[str] = None, - browser_zone: Optional[str] = None, skip_verification: bool = False, ) -> None: """ Check if required zones exist and create them if they don't. - Important: Only unblocker and SERP zones can be auto-created. - Browser zones require additional configuration parameters (like "start" value) - and must be created manually in the Bright Data dashboard. 
- Args: web_unlocker_zone: Web unlocker zone name (will be created if missing) serp_zone: SERP zone name (optional, will be created if missing) - browser_zone: Browser zone name (NOT auto-created, pass None to skip) Raises: ZoneError: If zone creation or validation fails @@ -80,10 +74,6 @@ async def ensure_required_zones( zones_to_create.append((serp_zone, "serp")) logger.info(f"Need to create SERP zone: {serp_zone}") - # Browser zones are intentionally NOT checked here - # They require additional configuration (like "start" parameter) - # and must be created manually in the Bright Data dashboard - if not zones_to_create: logger.info("All required zones already exist") return diff --git a/src/brightdata/datasets/__init__.py b/src/brightdata/datasets/__init__.py index b39c9ae..6182d95 100644 --- a/src/brightdata/datasets/__init__.py +++ b/src/brightdata/datasets/__init__.py @@ -10,8 +10,22 @@ from .utils import export, export_json, export_jsonl, export_csv # Platform-specific datasets -from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles, LinkedInJobListings -from .amazon import AmazonProducts, AmazonReviews, AmazonSellersInfo +from .linkedin import ( + LinkedInPeopleProfiles, + LinkedInCompanyProfiles, + LinkedInJobListings, + LinkedInPosts, + LinkedInProfilesJobListings, +) +from .amazon import ( + AmazonProducts, + AmazonReviews, + AmazonSellersInfo, + AmazonBestSellers, + AmazonProductsSearch, + AmazonProductsGlobal, + AmazonWalmart, +) from .crunchbase import CrunchbaseCompanies from .imdb import IMDBMovies from .nba import NBAPlayersStats @@ -20,7 +34,7 @@ from .companies_enriched import CompaniesEnriched from .employees_enriched import EmployeesEnriched from .glassdoor import GlassdoorCompanies, GlassdoorReviews, GlassdoorJobs -from .google_maps import GoogleMapsReviews +from .google_maps import GoogleMapsReviews, GoogleMapsFullInfo from .yelp import YelpBusinesses, YelpReviews from .zoominfo import ZoomInfoCompanies from .pitchbook import 
PitchBookCompanies @@ -34,10 +48,10 @@ from .manta import MantaBusinesses from .ventureradar import VentureRadarCompanies from .trustradius import TrustRadiusReviews -from .instagram import InstagramProfiles, InstagramPosts -from .tiktok import TikTokProfiles +from .instagram import InstagramProfiles, InstagramPosts, InstagramComments, InstagramReels +from .tiktok import TikTokProfiles, TikTokComments, TikTokPosts, TikTokShop from .real_estate import AustraliaRealEstate -from .walmart import WalmartProducts +from .walmart import WalmartProducts, WalmartSellersInfo from .mediamarkt import MediamarktProducts from .fendi import FendiProducts from .zalando import ZalandoProducts @@ -67,7 +81,7 @@ from .raymourflanigan import RaymourFlaniganProducts from .inmuebles24 import Inmuebles24Mexico from .mouser import MouserProducts -from .zillow import ZillowProperties +from .zillow import ZillowProperties, ZillowPriceHistory from .zonaprop import ZonapropArgentina from .metrocuadrado import MetrocuadradoProperties from .chileautos import ChileautosChile @@ -92,10 +106,62 @@ from .world_zipcodes import WorldZipcodes from .pinterest import PinterestPosts, PinterestProfiles from .shopee import ShopeeProducts -from .lazada import LazadaProducts +from .lazada import LazadaProducts, LazadaReviews, LazadaProductsSearch from .youtube import YouTubeProfiles, YouTubeVideos, YouTubeComments from .digikey import DigikeyProducts -from .facebook import FacebookPagesPosts +from .facebook import ( + FacebookPagesPosts, + FacebookComments, + FacebookPostsByUrl, + FacebookReels, + FacebookMarketplace, + FacebookCompanyReviews, + FacebookEvents, + FacebookProfiles, + FacebookPagesProfiles, + FacebookGroupPosts, +) +from .x_twitter import XTwitterPosts, XTwitterProfiles +from .reddit import RedditPosts, RedditComments +from .bluesky import BlueskyPosts, BlueskyTopProfiles +from .snapchat import SnapchatPosts +from .quora import QuoraPosts +from .vimeo import VimeoVideos +from .google_news 
import GoogleNews +from .wikipedia import WikipediaArticles +from .bbc import BBCNews +from .cnn import CNNNews +from .github import GithubRepositories +from .creative_commons import CreativeCommonsImages, CreativeCommons3DModels +from .google_play import GooglePlayStore, GooglePlayReviews +from .apple_appstore import AppleAppStore, AppleAppStoreReviews +from .ebay import EbayProducts +from .etsy import EtsyProducts +from .target import TargetProducts +from .wayfair import WayfairProducts +from .bestbuy import BestBuyProducts +from .myntra import MyntraProducts +from .ozon import OzonProducts +from .wildberries import WildberriesProducts +from .tokopedia import TokopediaProducts +from .google_shopping import GoogleShoppingProducts, GoogleShoppingSearchUS +from .mercadolivre import MercadolivreProducts +from .naver import NaverProducts +from .homedepot import HomeDepotUSProducts, HomeDepotCAProducts +from .lowes import LowesProducts +from .rona import RonaProducts +from .kroger import KrogerProducts +from .macys import MacysProducts +from .costco import CostcoProducts +from .bh import BHProducts +from .microcenter import MicroCenterProducts +from .autozone import AutozoneProducts +from .zoopla import ZooplaProperties +from .booking import BookingListingsSearch, BookingHotelListings +from .realtor import RealtorInternationalProperties +from .agoda import AgodaProperties +from .carsales import CarsalesListings +from .yahoo_finance import YahooFinanceBusinesses __all__ = [ # Client @@ -299,4 +365,128 @@ "DigikeyProducts", # Facebook "FacebookPagesPosts", + "FacebookComments", + "FacebookPostsByUrl", + "FacebookReels", + "FacebookMarketplace", + "FacebookCompanyReviews", + "FacebookEvents", + "FacebookProfiles", + "FacebookPagesProfiles", + "FacebookGroupPosts", + # LinkedIn (additional) + "LinkedInPosts", + "LinkedInProfilesJobListings", + # Amazon (additional) + "AmazonBestSellers", + "AmazonProductsSearch", + "AmazonProductsGlobal", + "AmazonWalmart", + # Instagram 
(additional) + "InstagramComments", + "InstagramReels", + # TikTok (additional) + "TikTokComments", + "TikTokPosts", + "TikTokShop", + # Google Maps (additional) + "GoogleMapsFullInfo", + # Walmart (additional) + "WalmartSellersInfo", + # Zillow (additional) + "ZillowPriceHistory", + # Lazada (additional) + "LazadaReviews", + "LazadaProductsSearch", + # X / Twitter + "XTwitterPosts", + "XTwitterProfiles", + # Reddit + "RedditPosts", + "RedditComments", + # Bluesky + "BlueskyPosts", + "BlueskyTopProfiles", + # Snapchat + "SnapchatPosts", + # Quora + "QuoraPosts", + # Vimeo + "VimeoVideos", + # Google News + "GoogleNews", + # Wikipedia + "WikipediaArticles", + # BBC + "BBCNews", + # CNN + "CNNNews", + # GitHub + "GithubRepositories", + # Creative Commons + "CreativeCommonsImages", + "CreativeCommons3DModels", + # Google Play + "GooglePlayStore", + "GooglePlayReviews", + # Apple App Store + "AppleAppStore", + "AppleAppStoreReviews", + # eBay + "EbayProducts", + # Etsy + "EtsyProducts", + # Target + "TargetProducts", + # Wayfair + "WayfairProducts", + # Best Buy + "BestBuyProducts", + # Myntra + "MyntraProducts", + # Ozon + "OzonProducts", + # Wildberries + "WildberriesProducts", + # Tokopedia + "TokopediaProducts", + # Google Shopping + "GoogleShoppingProducts", + "GoogleShoppingSearchUS", + # Mercado Livre + "MercadolivreProducts", + # Naver + "NaverProducts", + # Home Depot + "HomeDepotUSProducts", + "HomeDepotCAProducts", + # Lowe's + "LowesProducts", + # Rona + "RonaProducts", + # Kroger + "KrogerProducts", + # Macy's + "MacysProducts", + # Costco + "CostcoProducts", + # B&H + "BHProducts", + # Micro Center + "MicroCenterProducts", + # Autozone + "AutozoneProducts", + # Zoopla + "ZooplaProperties", + # Booking + "BookingListingsSearch", + "BookingHotelListings", + # Realtor + "RealtorInternationalProperties", + # Agoda + "AgodaProperties", + # Carsales + "CarsalesListings", + # Yahoo Finance + "YahooFinanceBusinesses", ] diff --git 
a/src/brightdata/datasets/agoda/__init__.py b/src/brightdata/datasets/agoda/__init__.py new file mode 100644 index 0000000..227ef86 --- /dev/null +++ b/src/brightdata/datasets/agoda/__init__.py @@ -0,0 +1,5 @@ +"""Agoda dataset.""" + +from .properties import AgodaProperties + +__all__ = ["AgodaProperties"] diff --git a/src/brightdata/datasets/agoda/properties.py b/src/brightdata/datasets/agoda/properties.py new file mode 100644 index 0000000..aa43f2d --- /dev/null +++ b/src/brightdata/datasets/agoda/properties.py @@ -0,0 +1,25 @@ +""" +Agoda Properties dataset. + +Agoda property listings dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AgodaProperties(BaseDataset): + """AgodaProperties dataset.""" + + DATASET_ID = "gd_m837ssst155rq3a1xo" + NAME = "agoda_properties" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/amazon/__init__.py b/src/brightdata/datasets/amazon/__init__.py index 5ab6998..54efa19 100644 --- a/src/brightdata/datasets/amazon/__init__.py +++ b/src/brightdata/datasets/amazon/__init__.py @@ -3,5 +3,17 @@ from .products import AmazonProducts from .reviews import AmazonReviews from .sellers import AmazonSellersInfo +from .best_sellers import AmazonBestSellers +from .products_search import AmazonProductsSearch +from .products_global import AmazonProductsGlobal +from .walmart import AmazonWalmart -__all__ = ["AmazonProducts", "AmazonReviews", "AmazonSellersInfo"] +__all__ = [ + "AmazonProducts", + "AmazonReviews", + "AmazonSellersInfo", + "AmazonBestSellers", + "AmazonProductsSearch", + "AmazonProductsGlobal", + "AmazonWalmart", +] diff --git a/src/brightdata/datasets/amazon/best_sellers.py 
b/src/brightdata/datasets/amazon/best_sellers.py new file mode 100644 index 0000000..c6f1c55 --- /dev/null +++ b/src/brightdata/datasets/amazon/best_sellers.py @@ -0,0 +1,25 @@ +""" +Amazon Best Sellers dataset. + +Amazon best-selling products with rankings, categories, and sales data. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AmazonBestSellers(BaseDataset): + """AmazonBestSellers dataset.""" + + DATASET_ID = "gd_l1vijixj9g2vp7563" + NAME = "amazon_best_sellers" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/amazon/products_global.py b/src/brightdata/datasets/amazon/products_global.py new file mode 100644 index 0000000..5e3e8b1 --- /dev/null +++ b/src/brightdata/datasets/amazon/products_global.py @@ -0,0 +1,25 @@ +""" +Amazon Products Global dataset. + +Amazon product data across global marketplaces with localized pricing and availability. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AmazonProductsGlobal(BaseDataset): + """AmazonProductsGlobal dataset.""" + + DATASET_ID = "gd_lwhideng15g8jg63s7" + NAME = "amazon_products_global" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/amazon/products_search.py b/src/brightdata/datasets/amazon/products_search.py new file mode 100644 index 0000000..cfa1832 --- /dev/null +++ b/src/brightdata/datasets/amazon/products_search.py @@ -0,0 +1,25 @@ +""" +Amazon Products Search dataset. 
+ +Amazon product search results with listings, prices, and relevance data. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AmazonProductsSearch(BaseDataset): + """AmazonProductsSearch dataset.""" + + DATASET_ID = "gd_lwdb4vjm1ehb499uxs" + NAME = "amazon_products_search" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/amazon/walmart.py b/src/brightdata/datasets/amazon/walmart.py new file mode 100644 index 0000000..a58b395 --- /dev/null +++ b/src/brightdata/datasets/amazon/walmart.py @@ -0,0 +1,25 @@ +""" +Amazon Walmart dataset. + +Amazon product data cross-referenced with Walmart for price comparison and availability. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AmazonWalmart(BaseDataset): + """AmazonWalmart dataset.""" + + DATASET_ID = "gd_m4l6s4mn2g2rkx9lia" + NAME = "amazon_walmart" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/apple_appstore/__init__.py b/src/brightdata/datasets/apple_appstore/__init__.py new file mode 100644 index 0000000..804a553 --- /dev/null +++ b/src/brightdata/datasets/apple_appstore/__init__.py @@ -0,0 +1,6 @@ +"""Apple App Store datasets.""" + +from .store import AppleAppStore +from .reviews import AppleAppStoreReviews + +__all__ = ["AppleAppStore", "AppleAppStoreReviews"] diff --git a/src/brightdata/datasets/apple_appstore/reviews.py b/src/brightdata/datasets/apple_appstore/reviews.py new file mode 100644 index 0000000..5e9fe9e --- /dev/null +++ b/src/brightdata/datasets/apple_appstore/reviews.py @@ -0,0 +1,25 @@ +""" +Apple App Store Reviews dataset. + +Apple App Store reviews dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AppleAppStoreReviews(BaseDataset): + """AppleAppStoreReviews dataset.""" + + DATASET_ID = "gd_m734msue16e0adkbit" + NAME = "apple_app_store_reviews" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/apple_appstore/store.py b/src/brightdata/datasets/apple_appstore/store.py new file mode 100644 index 0000000..2bf9b47 --- /dev/null +++ b/src/brightdata/datasets/apple_appstore/store.py @@ -0,0 +1,25 @@ +""" +Apple App Store dataset. 
+ +Apple App Store apps dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AppleAppStore(BaseDataset): + """AppleAppStore dataset.""" + + DATASET_ID = "gd_lsk9ki3u2iishmwrui" + NAME = "apple_app_store" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/autozone/__init__.py b/src/brightdata/datasets/autozone/__init__.py new file mode 100644 index 0000000..e8d186f --- /dev/null +++ b/src/brightdata/datasets/autozone/__init__.py @@ -0,0 +1,5 @@ +"""AutoZone dataset.""" + +from .products import AutozoneProducts + +__all__ = ["AutozoneProducts"] diff --git a/src/brightdata/datasets/autozone/products.py b/src/brightdata/datasets/autozone/products.py new file mode 100644 index 0000000..bd46151 --- /dev/null +++ b/src/brightdata/datasets/autozone/products.py @@ -0,0 +1,25 @@ +""" +AutoZone Products dataset. + +AutoZone products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AutozoneProducts(BaseDataset): + """AutozoneProducts dataset.""" + + DATASET_ID = "gd_mkcnp8yy1kf8oxpvij" + NAME = "autozone_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/bbc/__init__.py b/src/brightdata/datasets/bbc/__init__.py new file mode 100644 index 0000000..c065342 --- /dev/null +++ b/src/brightdata/datasets/bbc/__init__.py @@ -0,0 +1,5 @@ +"""BBC dataset.""" + +from .news import BBCNews + +__all__ = ["BBCNews"] diff --git a/src/brightdata/datasets/bbc/news.py b/src/brightdata/datasets/bbc/news.py new file mode 100644 index 0000000..e53fa35 --- /dev/null +++ b/src/brightdata/datasets/bbc/news.py @@ -0,0 +1,25 @@ +""" +BBC News dataset. + +BBC news articles dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BBCNews(BaseDataset): + """BBCNews dataset.""" + + DATASET_ID = "gd_ly5lkfzd1h8c85feyh" + NAME = "bbc_news" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/bestbuy/__init__.py b/src/brightdata/datasets/bestbuy/__init__.py new file mode 100644 index 0000000..58e7182 --- /dev/null +++ b/src/brightdata/datasets/bestbuy/__init__.py @@ -0,0 +1,5 @@ +"""Best Buy dataset.""" + +from .products import BestBuyProducts + +__all__ = ["BestBuyProducts"] diff --git a/src/brightdata/datasets/bestbuy/products.py b/src/brightdata/datasets/bestbuy/products.py new file mode 100644 index 0000000..3deb6f1 --- /dev/null +++ b/src/brightdata/datasets/bestbuy/products.py @@ -0,0 +1,25 @@ +""" +Best Buy Products dataset. + +Best Buy products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BestBuyProducts(BaseDataset): + """BestBuyProducts dataset.""" + + DATASET_ID = "gd_ltre1jqe1jfr7cccf" + NAME = "bestbuy_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/bh/__init__.py b/src/brightdata/datasets/bh/__init__.py new file mode 100644 index 0000000..03d6290 --- /dev/null +++ b/src/brightdata/datasets/bh/__init__.py @@ -0,0 +1,5 @@ +"""B&H Photo dataset.""" + +from .products import BHProducts + +__all__ = ["BHProducts"] diff --git a/src/brightdata/datasets/bh/products.py b/src/brightdata/datasets/bh/products.py new file mode 100644 index 0000000..6f12ff8 --- /dev/null +++ b/src/brightdata/datasets/bh/products.py @@ -0,0 +1,25 @@ +""" +B&H Photo Products dataset. + +B&H Photo products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BHProducts(BaseDataset): + """BHProducts dataset.""" + + DATASET_ID = "gd_mkce0sox1mchrlpp8g" + NAME = "bh_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/bluesky/__init__.py b/src/brightdata/datasets/bluesky/__init__.py new file mode 100644 index 0000000..8bfe7ed --- /dev/null +++ b/src/brightdata/datasets/bluesky/__init__.py @@ -0,0 +1,6 @@ +"""Bluesky datasets.""" + +from .posts import BlueskyPosts +from .top_profiles import BlueskyTopProfiles + +__all__ = ["BlueskyPosts", "BlueskyTopProfiles"] diff --git a/src/brightdata/datasets/bluesky/posts.py b/src/brightdata/datasets/bluesky/posts.py new file mode 100644 index 0000000..a7bbf49 --- /dev/null +++ b/src/brightdata/datasets/bluesky/posts.py @@ -0,0 +1,25 @@ +""" +Bluesky Posts dataset. + +Bluesky posts dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BlueskyPosts(BaseDataset): + """BlueskyPosts dataset.""" + + DATASET_ID = "gd_m6hn4r5s27zfhc7w4" + NAME = "bluesky_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/bluesky/top_profiles.py b/src/brightdata/datasets/bluesky/top_profiles.py new file mode 100644 index 0000000..299f101 --- /dev/null +++ b/src/brightdata/datasets/bluesky/top_profiles.py @@ -0,0 +1,25 @@ +""" +Bluesky Top Profiles dataset. + +Top 500 Bluesky profiles dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BlueskyTopProfiles(BaseDataset): + """BlueskyTopProfiles dataset.""" + + DATASET_ID = "gd_m45p78dl1m017wi5lj" + NAME = "bluesky_top_profiles" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/booking/__init__.py b/src/brightdata/datasets/booking/__init__.py new file mode 100644 index 0000000..e003247 --- /dev/null +++ b/src/brightdata/datasets/booking/__init__.py @@ -0,0 +1,6 @@ +"""Booking.com datasets.""" + +from .listings_search import BookingListingsSearch +from .hotel_listings import BookingHotelListings + +__all__ = ["BookingListingsSearch", "BookingHotelListings"] diff --git a/src/brightdata/datasets/booking/hotel_listings.py b/src/brightdata/datasets/booking/hotel_listings.py new file mode 100644 index 0000000..43afa1c --- /dev/null +++ b/src/brightdata/datasets/booking/hotel_listings.py @@ -0,0 +1,25 @@ +""" +Booking Hotel Listings dataset. + +Booking.com hotel listings dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BookingHotelListings(BaseDataset): + """BookingHotelListings dataset.""" + + DATASET_ID = "gd_m5mbdl081229ln6t4a" + NAME = "booking_hotel_listings" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/booking/listings_search.py b/src/brightdata/datasets/booking/listings_search.py new file mode 100644 index 0000000..edf9abe --- /dev/null +++ b/src/brightdata/datasets/booking/listings_search.py @@ -0,0 +1,25 @@ +""" +Booking Listings Search dataset. + +Booking.com listings search dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BookingListingsSearch(BaseDataset): + """BookingListingsSearch dataset.""" + + DATASET_ID = "gd_m4bf7a917zfezv9d5" + NAME = "booking_listings_search" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/carsales/__init__.py b/src/brightdata/datasets/carsales/__init__.py new file mode 100644 index 0000000..6b38482 --- /dev/null +++ b/src/brightdata/datasets/carsales/__init__.py @@ -0,0 +1,5 @@ +"""Carsales dataset.""" + +from .listings import CarsalesListings + +__all__ = ["CarsalesListings"] diff --git a/src/brightdata/datasets/carsales/listings.py b/src/brightdata/datasets/carsales/listings.py new file mode 100644 index 0000000..0933b81 --- /dev/null +++ b/src/brightdata/datasets/carsales/listings.py @@ -0,0 +1,25 @@ +""" +Carsales Listings dataset. + +Carsales car listings dataset. 
+ +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CarsalesListings(BaseDataset): + """CarsalesListings dataset.""" + + DATASET_ID = "gd_m8h7qkn317z9rvlngb" + NAME = "carsales_listings" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/client.py b/src/brightdata/datasets/client.py index 1e03a77..8759762 100644 --- a/src/brightdata/datasets/client.py +++ b/src/brightdata/datasets/client.py @@ -5,8 +5,22 @@ from typing import List, Optional, TYPE_CHECKING from .models import DatasetInfo -from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles, LinkedInJobListings -from .amazon import AmazonProducts, AmazonReviews, AmazonSellersInfo +from .linkedin import ( + LinkedInPeopleProfiles, + LinkedInCompanyProfiles, + LinkedInJobListings, + LinkedInPosts, + LinkedInProfilesJobListings, +) +from .amazon import ( + AmazonProducts, + AmazonReviews, + AmazonSellersInfo, + AmazonBestSellers, + AmazonProductsSearch, + AmazonProductsGlobal, + AmazonWalmart, +) from .crunchbase import CrunchbaseCompanies from .imdb import IMDBMovies from .nba import NBAPlayersStats @@ -15,7 +29,7 @@ from .companies_enriched import CompaniesEnriched from .employees_enriched import EmployeesEnriched from .glassdoor import GlassdoorCompanies, GlassdoorReviews, GlassdoorJobs -from .google_maps import GoogleMapsReviews +from .google_maps import GoogleMapsReviews, GoogleMapsFullInfo from .yelp import YelpBusinesses, YelpReviews from .zoominfo import ZoomInfoCompanies from .pitchbook import PitchBookCompanies @@ -29,10 +43,10 @@ from .manta import MantaBusinesses from .ventureradar import VentureRadarCompanies from .trustradius import TrustRadiusReviews -from .instagram import 
InstagramProfiles, InstagramPosts -from .tiktok import TikTokProfiles +from .instagram import InstagramProfiles, InstagramPosts, InstagramComments, InstagramReels +from .tiktok import TikTokProfiles, TikTokComments, TikTokPosts, TikTokShop from .real_estate import AustraliaRealEstate -from .walmart import WalmartProducts +from .walmart import WalmartProducts, WalmartSellersInfo from .mediamarkt import MediamarktProducts from .fendi import FendiProducts from .zalando import ZalandoProducts @@ -62,7 +76,7 @@ from .raymourflanigan import RaymourFlaniganProducts from .inmuebles24 import Inmuebles24Mexico from .mouser import MouserProducts -from .zillow import ZillowProperties +from .zillow import ZillowProperties, ZillowPriceHistory from .zonaprop import ZonapropArgentina from .metrocuadrado import MetrocuadradoProperties from .chileautos import ChileautosChile @@ -87,10 +101,62 @@ from .world_zipcodes import WorldZipcodes from .pinterest import PinterestPosts, PinterestProfiles from .shopee import ShopeeProducts -from .lazada import LazadaProducts +from .lazada import LazadaProducts, LazadaReviews, LazadaProductsSearch from .youtube import YouTubeProfiles, YouTubeVideos, YouTubeComments from .digikey import DigikeyProducts -from .facebook import FacebookPagesPosts +from .facebook import ( + FacebookPagesPosts, + FacebookComments, + FacebookPostsByUrl, + FacebookReels, + FacebookMarketplace, + FacebookCompanyReviews, + FacebookEvents, + FacebookProfiles, + FacebookPagesProfiles, + FacebookGroupPosts, +) +from .x_twitter import XTwitterPosts, XTwitterProfiles +from .reddit import RedditPosts, RedditComments +from .bluesky import BlueskyPosts, BlueskyTopProfiles +from .snapchat import SnapchatPosts +from .quora import QuoraPosts +from .vimeo import VimeoVideos +from .google_news import GoogleNews +from .wikipedia import WikipediaArticles +from .bbc import BBCNews +from .cnn import CNNNews +from .github import GithubRepositories +from .creative_commons import 
CreativeCommonsImages, CreativeCommons3DModels +from .google_play import GooglePlayStore, GooglePlayReviews +from .apple_appstore import AppleAppStore, AppleAppStoreReviews +from .ebay import EbayProducts +from .etsy import EtsyProducts +from .target import TargetProducts +from .wayfair import WayfairProducts +from .bestbuy import BestBuyProducts +from .myntra import MyntraProducts +from .ozon import OzonProducts +from .wildberries import WildberriesProducts +from .tokopedia import TokopediaProducts +from .google_shopping import GoogleShoppingProducts, GoogleShoppingSearchUS +from .mercadolivre import MercadolivreProducts +from .naver import NaverProducts +from .homedepot import HomeDepotUSProducts, HomeDepotCAProducts +from .lowes import LowesProducts +from .rona import RonaProducts +from .kroger import KrogerProducts +from .macys import MacysProducts +from .costco import CostcoProducts +from .bh import BHProducts +from .microcenter import MicroCenterProducts +from .autozone import AutozoneProducts +from .zoopla import ZooplaProperties +from .booking import BookingListingsSearch, BookingHotelListings +from .realtor import RealtorInternationalProperties +from .agoda import AgodaProperties +from .carsales import CarsalesListings +from .yahoo_finance import YahooFinanceBusinesses if TYPE_CHECKING: from ..core.async_engine import AsyncEngine @@ -226,6 +292,87 @@ def __init__(self, engine: "AsyncEngine"): self._youtube_comments: Optional[YouTubeComments] = None self._digikey_products: Optional[DigikeyProducts] = None self._facebook_pages_posts: Optional[FacebookPagesPosts] = None + # New datasets - Social Media + self._facebook_comments: Optional[FacebookComments] = None + self._facebook_posts_by_url: Optional[FacebookPostsByUrl] = None + self._facebook_reels: Optional[FacebookReels] = None + self._facebook_marketplace: Optional[FacebookMarketplace] = None + self._facebook_company_reviews: Optional[FacebookCompanyReviews] = None + self._facebook_events: 
Optional[FacebookEvents] = None + self._facebook_profiles: Optional[FacebookProfiles] = None + self._facebook_pages_profiles: Optional[FacebookPagesProfiles] = None + self._facebook_group_posts: Optional[FacebookGroupPosts] = None + self._tiktok_comments: Optional[TikTokComments] = None + self._tiktok_posts: Optional[TikTokPosts] = None + self._tiktok_shop: Optional[TikTokShop] = None + self._instagram_comments: Optional[InstagramComments] = None + self._instagram_reels: Optional[InstagramReels] = None + self._linkedin_posts: Optional[LinkedInPosts] = None + self._linkedin_profiles_job_listings: Optional[LinkedInProfilesJobListings] = None + self._x_twitter_posts: Optional[XTwitterPosts] = None + self._x_twitter_profiles: Optional[XTwitterProfiles] = None + self._reddit_posts: Optional[RedditPosts] = None + self._reddit_comments: Optional[RedditComments] = None + self._bluesky_posts: Optional[BlueskyPosts] = None + self._bluesky_top_profiles: Optional[BlueskyTopProfiles] = None + self._snapchat_posts: Optional[SnapchatPosts] = None + self._quora_posts: Optional[QuoraPosts] = None + self._vimeo_videos: Optional[VimeoVideos] = None + # New datasets - News/Content + self._google_news: Optional[GoogleNews] = None + self._wikipedia_articles: Optional[WikipediaArticles] = None + self._bbc_news: Optional[BBCNews] = None + self._cnn_news: Optional[CNNNews] = None + self._github_repositories: Optional[GithubRepositories] = None + self._creative_commons_images: Optional[CreativeCommonsImages] = None + self._creative_commons_3d_models: Optional[CreativeCommons3DModels] = None + # New datasets - App Stores + self._google_play_store: Optional[GooglePlayStore] = None + self._google_play_reviews: Optional[GooglePlayReviews] = None + self._apple_app_store: Optional[AppleAppStore] = None + self._apple_app_store_reviews: Optional[AppleAppStoreReviews] = None + # New datasets - E-commerce + self._amazon_best_sellers: Optional[AmazonBestSellers] = None + self._amazon_products_search: 
Optional[AmazonProductsSearch] = None + self._amazon_products_global: Optional[AmazonProductsGlobal] = None + self._amazon_walmart: Optional[AmazonWalmart] = None + self._walmart_sellers_info: Optional[WalmartSellersInfo] = None + self._ebay_products: Optional[EbayProducts] = None + self._etsy_products: Optional[EtsyProducts] = None + self._target_products: Optional[TargetProducts] = None + self._wayfair_products: Optional[WayfairProducts] = None + self._bestbuy_products: Optional[BestBuyProducts] = None + self._myntra_products: Optional[MyntraProducts] = None + self._ozon_products: Optional[OzonProducts] = None + self._wildberries_products: Optional[WildberriesProducts] = None + self._tokopedia_products: Optional[TokopediaProducts] = None + self._google_shopping_products: Optional[GoogleShoppingProducts] = None + self._google_shopping_search_us: Optional[GoogleShoppingSearchUS] = None + self._mercadolivre_products: Optional[MercadolivreProducts] = None + self._naver_products: Optional[NaverProducts] = None + self._lazada_reviews: Optional[LazadaReviews] = None + self._lazada_products_search: Optional[LazadaProductsSearch] = None + self._homedepot_us_products: Optional[HomeDepotUSProducts] = None + self._homedepot_ca_products: Optional[HomeDepotCAProducts] = None + self._lowes_products: Optional[LowesProducts] = None + self._rona_products: Optional[RonaProducts] = None + self._kroger_products: Optional[KrogerProducts] = None + self._macys_products: Optional[MacysProducts] = None + self._costco_products: Optional[CostcoProducts] = None + self._bh_products: Optional[BHProducts] = None + self._microcenter_products: Optional[MicroCenterProducts] = None + self._autozone_products: Optional[AutozoneProducts] = None + # New datasets - Real Estate/Travel + self._zillow_price_history: Optional[ZillowPriceHistory] = None + self._zoopla_properties: Optional[ZooplaProperties] = None + self._booking_listings_search: Optional[BookingListingsSearch] = None + 
self._booking_hotel_listings: Optional[BookingHotelListings] = None + self._realtor_international_properties: Optional[RealtorInternationalProperties] = None + self._agoda_properties: Optional[AgodaProperties] = None + self._carsales_listings: Optional[CarsalesListings] = None + # New datasets - Finance/Maps + self._yahoo_finance_businesses: Optional[YahooFinanceBusinesses] = None + self._google_maps_full_info: Optional[GoogleMapsFullInfo] = None async def list(self) -> List[DatasetInfo]: """ @@ -949,3 +1096,540 @@ def facebook_pages_posts(self) -> FacebookPagesPosts: if self._facebook_pages_posts is None: self._facebook_pages_posts = FacebookPagesPosts(self._engine) return self._facebook_pages_posts + + # --- New dataset properties - Social Media --- + + @property + def facebook_comments(self) -> FacebookComments: + """Facebook Comments dataset.""" + if self._facebook_comments is None: + self._facebook_comments = FacebookComments(self._engine) + return self._facebook_comments + + @property + def facebook_posts_by_url(self) -> FacebookPostsByUrl: + """Facebook Posts by URL dataset.""" + if self._facebook_posts_by_url is None: + self._facebook_posts_by_url = FacebookPostsByUrl(self._engine) + return self._facebook_posts_by_url + + @property + def facebook_reels(self) -> FacebookReels: + """Facebook Reels dataset.""" + if self._facebook_reels is None: + self._facebook_reels = FacebookReels(self._engine) + return self._facebook_reels + + @property + def facebook_marketplace(self) -> FacebookMarketplace: + """Facebook Marketplace dataset.""" + if self._facebook_marketplace is None: + self._facebook_marketplace = FacebookMarketplace(self._engine) + return self._facebook_marketplace + + @property + def facebook_company_reviews(self) -> FacebookCompanyReviews: + """Facebook Company Reviews dataset.""" + if self._facebook_company_reviews is None: + self._facebook_company_reviews = FacebookCompanyReviews(self._engine) + return self._facebook_company_reviews + + @property + 
def facebook_events(self) -> FacebookEvents: + """Facebook Events dataset.""" + if self._facebook_events is None: + self._facebook_events = FacebookEvents(self._engine) + return self._facebook_events + + @property + def facebook_profiles(self) -> FacebookProfiles: + """Facebook Profiles dataset.""" + if self._facebook_profiles is None: + self._facebook_profiles = FacebookProfiles(self._engine) + return self._facebook_profiles + + @property + def facebook_pages_profiles(self) -> FacebookPagesProfiles: + """Facebook Pages and Profiles dataset.""" + if self._facebook_pages_profiles is None: + self._facebook_pages_profiles = FacebookPagesProfiles(self._engine) + return self._facebook_pages_profiles + + @property + def facebook_group_posts(self) -> FacebookGroupPosts: + """Facebook Group Posts dataset.""" + if self._facebook_group_posts is None: + self._facebook_group_posts = FacebookGroupPosts(self._engine) + return self._facebook_group_posts + + @property + def tiktok_comments(self) -> TikTokComments: + """TikTok Comments dataset.""" + if self._tiktok_comments is None: + self._tiktok_comments = TikTokComments(self._engine) + return self._tiktok_comments + + @property + def tiktok_posts(self) -> TikTokPosts: + """TikTok Posts dataset.""" + if self._tiktok_posts is None: + self._tiktok_posts = TikTokPosts(self._engine) + return self._tiktok_posts + + @property + def tiktok_shop(self) -> TikTokShop: + """TikTok Shop dataset.""" + if self._tiktok_shop is None: + self._tiktok_shop = TikTokShop(self._engine) + return self._tiktok_shop + + @property + def instagram_comments(self) -> InstagramComments: + """Instagram Comments dataset.""" + if self._instagram_comments is None: + self._instagram_comments = InstagramComments(self._engine) + return self._instagram_comments + + @property + def instagram_reels(self) -> InstagramReels: + """Instagram Reels dataset.""" + if self._instagram_reels is None: + self._instagram_reels = InstagramReels(self._engine) + return 
self._instagram_reels + + @property + def linkedin_posts(self) -> LinkedInPosts: + """LinkedIn Posts dataset.""" + if self._linkedin_posts is None: + self._linkedin_posts = LinkedInPosts(self._engine) + return self._linkedin_posts + + @property + def linkedin_profiles_job_listings(self) -> LinkedInProfilesJobListings: + """LinkedIn Profiles Job Listings dataset.""" + if self._linkedin_profiles_job_listings is None: + self._linkedin_profiles_job_listings = LinkedInProfilesJobListings(self._engine) + return self._linkedin_profiles_job_listings + + @property + def x_twitter_posts(self) -> XTwitterPosts: + """X (Twitter) Posts dataset.""" + if self._x_twitter_posts is None: + self._x_twitter_posts = XTwitterPosts(self._engine) + return self._x_twitter_posts + + @property + def x_twitter_profiles(self) -> XTwitterProfiles: + """X (Twitter) Profiles dataset.""" + if self._x_twitter_profiles is None: + self._x_twitter_profiles = XTwitterProfiles(self._engine) + return self._x_twitter_profiles + + @property + def reddit_posts(self) -> RedditPosts: + """Reddit Posts dataset.""" + if self._reddit_posts is None: + self._reddit_posts = RedditPosts(self._engine) + return self._reddit_posts + + @property + def reddit_comments(self) -> RedditComments: + """Reddit Comments dataset.""" + if self._reddit_comments is None: + self._reddit_comments = RedditComments(self._engine) + return self._reddit_comments + + @property + def bluesky_posts(self) -> BlueskyPosts: + """Bluesky Posts dataset.""" + if self._bluesky_posts is None: + self._bluesky_posts = BlueskyPosts(self._engine) + return self._bluesky_posts + + @property + def bluesky_top_profiles(self) -> BlueskyTopProfiles: + """Top 500 Bluesky Profiles dataset.""" + if self._bluesky_top_profiles is None: + self._bluesky_top_profiles = BlueskyTopProfiles(self._engine) + return self._bluesky_top_profiles + + @property + def snapchat_posts(self) -> SnapchatPosts: + """Snapchat Posts dataset.""" + if self._snapchat_posts is None: + 
self._snapchat_posts = SnapchatPosts(self._engine) + return self._snapchat_posts + + @property + def quora_posts(self) -> QuoraPosts: + """Quora Posts dataset.""" + if self._quora_posts is None: + self._quora_posts = QuoraPosts(self._engine) + return self._quora_posts + + @property + def vimeo_videos(self) -> VimeoVideos: + """Vimeo Videos dataset.""" + if self._vimeo_videos is None: + self._vimeo_videos = VimeoVideos(self._engine) + return self._vimeo_videos + + # --- New dataset properties - News/Content --- + + @property + def google_news(self) -> GoogleNews: + """Google News dataset.""" + if self._google_news is None: + self._google_news = GoogleNews(self._engine) + return self._google_news + + @property + def wikipedia_articles(self) -> WikipediaArticles: + """Wikipedia Articles dataset.""" + if self._wikipedia_articles is None: + self._wikipedia_articles = WikipediaArticles(self._engine) + return self._wikipedia_articles + + @property + def bbc_news(self) -> BBCNews: + """BBC News dataset.""" + if self._bbc_news is None: + self._bbc_news = BBCNews(self._engine) + return self._bbc_news + + @property + def cnn_news(self) -> CNNNews: + """CNN News dataset.""" + if self._cnn_news is None: + self._cnn_news = CNNNews(self._engine) + return self._cnn_news + + @property + def github_repositories(self) -> GithubRepositories: + """GitHub Repositories dataset.""" + if self._github_repositories is None: + self._github_repositories = GithubRepositories(self._engine) + return self._github_repositories + + @property + def creative_commons_images(self) -> CreativeCommonsImages: + """Creative Commons Images dataset.""" + if self._creative_commons_images is None: + self._creative_commons_images = CreativeCommonsImages(self._engine) + return self._creative_commons_images + + @property + def creative_commons_3d_models(self) -> CreativeCommons3DModels: + """Creative Commons 3D Models dataset.""" + if self._creative_commons_3d_models is None: + self._creative_commons_3d_models = 
CreativeCommons3DModels(self._engine) + return self._creative_commons_3d_models + + # --- New dataset properties - App Stores --- + + @property + def google_play_store(self) -> GooglePlayStore: + """Google Play Store dataset.""" + if self._google_play_store is None: + self._google_play_store = GooglePlayStore(self._engine) + return self._google_play_store + + @property + def google_play_reviews(self) -> GooglePlayReviews: + """Google Play Store Reviews dataset.""" + if self._google_play_reviews is None: + self._google_play_reviews = GooglePlayReviews(self._engine) + return self._google_play_reviews + + @property + def apple_app_store(self) -> AppleAppStore: + """Apple App Store dataset.""" + if self._apple_app_store is None: + self._apple_app_store = AppleAppStore(self._engine) + return self._apple_app_store + + @property + def apple_app_store_reviews(self) -> AppleAppStoreReviews: + """Apple App Store Reviews dataset.""" + if self._apple_app_store_reviews is None: + self._apple_app_store_reviews = AppleAppStoreReviews(self._engine) + return self._apple_app_store_reviews + + # --- New dataset properties - E-commerce --- + + @property + def amazon_best_sellers(self) -> AmazonBestSellers: + """Amazon Best Sellers dataset.""" + if self._amazon_best_sellers is None: + self._amazon_best_sellers = AmazonBestSellers(self._engine) + return self._amazon_best_sellers + + @property + def amazon_products_search(self) -> AmazonProductsSearch: + """Amazon Products Search dataset.""" + if self._amazon_products_search is None: + self._amazon_products_search = AmazonProductsSearch(self._engine) + return self._amazon_products_search + + @property + def amazon_products_global(self) -> AmazonProductsGlobal: + """Amazon Products Global dataset.""" + if self._amazon_products_global is None: + self._amazon_products_global = AmazonProductsGlobal(self._engine) + return self._amazon_products_global + + @property + def amazon_walmart(self) -> AmazonWalmart: + """Amazon Walmart dataset.""" + 
if self._amazon_walmart is None: + self._amazon_walmart = AmazonWalmart(self._engine) + return self._amazon_walmart + + @property + def walmart_sellers_info(self) -> WalmartSellersInfo: + """Walmart Sellers Info dataset.""" + if self._walmart_sellers_info is None: + self._walmart_sellers_info = WalmartSellersInfo(self._engine) + return self._walmart_sellers_info + + @property + def ebay_products(self) -> EbayProducts: + """eBay Products dataset.""" + if self._ebay_products is None: + self._ebay_products = EbayProducts(self._engine) + return self._ebay_products + + @property + def etsy_products(self) -> EtsyProducts: + """Etsy Products dataset.""" + if self._etsy_products is None: + self._etsy_products = EtsyProducts(self._engine) + return self._etsy_products + + @property + def target_products(self) -> TargetProducts: + """Target Products dataset.""" + if self._target_products is None: + self._target_products = TargetProducts(self._engine) + return self._target_products + + @property + def wayfair_products(self) -> WayfairProducts: + """Wayfair Products dataset.""" + if self._wayfair_products is None: + self._wayfair_products = WayfairProducts(self._engine) + return self._wayfair_products + + @property + def bestbuy_products(self) -> BestBuyProducts: + """Best Buy Products dataset.""" + if self._bestbuy_products is None: + self._bestbuy_products = BestBuyProducts(self._engine) + return self._bestbuy_products + + @property + def myntra_products(self) -> MyntraProducts: + """Myntra Products dataset.""" + if self._myntra_products is None: + self._myntra_products = MyntraProducts(self._engine) + return self._myntra_products + + @property + def ozon_products(self) -> OzonProducts: + """Ozon.ru Products dataset.""" + if self._ozon_products is None: + self._ozon_products = OzonProducts(self._engine) + return self._ozon_products + + @property + def wildberries_products(self) -> WildberriesProducts: + """Wildberries.ru Products dataset.""" + if self._wildberries_products is 
None: + self._wildberries_products = WildberriesProducts(self._engine) + return self._wildberries_products + + @property + def tokopedia_products(self) -> TokopediaProducts: + """Tokopedia Products dataset.""" + if self._tokopedia_products is None: + self._tokopedia_products = TokopediaProducts(self._engine) + return self._tokopedia_products + + @property + def google_shopping_products(self) -> GoogleShoppingProducts: + """Google Shopping Products dataset.""" + if self._google_shopping_products is None: + self._google_shopping_products = GoogleShoppingProducts(self._engine) + return self._google_shopping_products + + @property + def google_shopping_search_us(self) -> GoogleShoppingSearchUS: + """Google Shopping Search US dataset.""" + if self._google_shopping_search_us is None: + self._google_shopping_search_us = GoogleShoppingSearchUS(self._engine) + return self._google_shopping_search_us + + @property + def mercadolivre_products(self) -> MercadolivreProducts: + """MercadoLivre Products dataset.""" + if self._mercadolivre_products is None: + self._mercadolivre_products = MercadolivreProducts(self._engine) + return self._mercadolivre_products + + @property + def naver_products(self) -> NaverProducts: + """Naver Products dataset.""" + if self._naver_products is None: + self._naver_products = NaverProducts(self._engine) + return self._naver_products + + @property + def lazada_reviews(self) -> LazadaReviews: + """Lazada Reviews dataset.""" + if self._lazada_reviews is None: + self._lazada_reviews = LazadaReviews(self._engine) + return self._lazada_reviews + + @property + def lazada_products_search(self) -> LazadaProductsSearch: + """Lazada Products Search dataset.""" + if self._lazada_products_search is None: + self._lazada_products_search = LazadaProductsSearch(self._engine) + return self._lazada_products_search + + @property + def homedepot_us_products(self) -> HomeDepotUSProducts: + """Home Depot US Products dataset.""" + if self._homedepot_us_products is None: + 
self._homedepot_us_products = HomeDepotUSProducts(self._engine) + return self._homedepot_us_products + + @property + def homedepot_ca_products(self) -> HomeDepotCAProducts: + """Home Depot Canada Products dataset.""" + if self._homedepot_ca_products is None: + self._homedepot_ca_products = HomeDepotCAProducts(self._engine) + return self._homedepot_ca_products + + @property + def lowes_products(self) -> LowesProducts: + """Lowes Products dataset.""" + if self._lowes_products is None: + self._lowes_products = LowesProducts(self._engine) + return self._lowes_products + + @property + def rona_products(self) -> RonaProducts: + """Rona.ca Products dataset.""" + if self._rona_products is None: + self._rona_products = RonaProducts(self._engine) + return self._rona_products + + @property + def kroger_products(self) -> KrogerProducts: + """Kroger Products dataset.""" + if self._kroger_products is None: + self._kroger_products = KrogerProducts(self._engine) + return self._kroger_products + + @property + def macys_products(self) -> MacysProducts: + """Macys Products dataset.""" + if self._macys_products is None: + self._macys_products = MacysProducts(self._engine) + return self._macys_products + + @property + def costco_products(self) -> CostcoProducts: + """Costco Products dataset.""" + if self._costco_products is None: + self._costco_products = CostcoProducts(self._engine) + return self._costco_products + + @property + def bh_products(self) -> BHProducts: + """B&H Products dataset.""" + if self._bh_products is None: + self._bh_products = BHProducts(self._engine) + return self._bh_products + + @property + def microcenter_products(self) -> MicroCenterProducts: + """Micro Center Products dataset.""" + if self._microcenter_products is None: + self._microcenter_products = MicroCenterProducts(self._engine) + return self._microcenter_products + + @property + def autozone_products(self) -> AutozoneProducts: + """AutoZone Products dataset.""" + if self._autozone_products is None: + 
self._autozone_products = AutozoneProducts(self._engine) + return self._autozone_products + + # --- New dataset properties - Real Estate/Travel --- + + @property + def zillow_price_history(self) -> ZillowPriceHistory: + """Zillow Price History dataset.""" + if self._zillow_price_history is None: + self._zillow_price_history = ZillowPriceHistory(self._engine) + return self._zillow_price_history + + @property + def zoopla_properties(self) -> ZooplaProperties: + """Zoopla Properties dataset.""" + if self._zoopla_properties is None: + self._zoopla_properties = ZooplaProperties(self._engine) + return self._zoopla_properties + + @property + def booking_listings_search(self) -> BookingListingsSearch: + """Booking.com Listings Search dataset.""" + if self._booking_listings_search is None: + self._booking_listings_search = BookingListingsSearch(self._engine) + return self._booking_listings_search + + @property + def booking_hotel_listings(self) -> BookingHotelListings: + """Booking.com Hotel Listings dataset.""" + if self._booking_hotel_listings is None: + self._booking_hotel_listings = BookingHotelListings(self._engine) + return self._booking_hotel_listings + + @property + def realtor_international_properties(self) -> RealtorInternationalProperties: + """Realtor International Properties dataset.""" + if self._realtor_international_properties is None: + self._realtor_international_properties = RealtorInternationalProperties(self._engine) + return self._realtor_international_properties + + @property + def agoda_properties(self) -> AgodaProperties: + """Agoda Properties dataset.""" + if self._agoda_properties is None: + self._agoda_properties = AgodaProperties(self._engine) + return self._agoda_properties + + @property + def carsales_listings(self) -> CarsalesListings: + """Carsales Car Listings dataset.""" + if self._carsales_listings is None: + self._carsales_listings = CarsalesListings(self._engine) + return self._carsales_listings + + # --- New dataset properties - 
Finance/Maps --- + + @property + def yahoo_finance_businesses(self) -> YahooFinanceBusinesses: + """Yahoo Finance Businesses dataset.""" + if self._yahoo_finance_businesses is None: + self._yahoo_finance_businesses = YahooFinanceBusinesses(self._engine) + return self._yahoo_finance_businesses + + @property + def google_maps_full_info(self) -> GoogleMapsFullInfo: + """Google Maps Full Info dataset.""" + if self._google_maps_full_info is None: + self._google_maps_full_info = GoogleMapsFullInfo(self._engine) + return self._google_maps_full_info diff --git a/src/brightdata/datasets/cnn/__init__.py b/src/brightdata/datasets/cnn/__init__.py new file mode 100644 index 0000000..bebafb7 --- /dev/null +++ b/src/brightdata/datasets/cnn/__init__.py @@ -0,0 +1,5 @@ +"""CNN dataset.""" + +from .news import CNNNews + +__all__ = ["CNNNews"] diff --git a/src/brightdata/datasets/cnn/news.py b/src/brightdata/datasets/cnn/news.py new file mode 100644 index 0000000..ad3e4f4 --- /dev/null +++ b/src/brightdata/datasets/cnn/news.py @@ -0,0 +1,25 @@ +""" +CNN News dataset. + +CNN news articles dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CNNNews(BaseDataset): + """CNNNews dataset.""" + + DATASET_ID = "gd_lycz8783197ch4wvwg" + NAME = "cnn_news" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/costco/__init__.py b/src/brightdata/datasets/costco/__init__.py new file mode 100644 index 0000000..c2378b2 --- /dev/null +++ b/src/brightdata/datasets/costco/__init__.py @@ -0,0 +1,5 @@ +"""Costco dataset.""" + +from .products import CostcoProducts + +__all__ = ["CostcoProducts"] diff --git a/src/brightdata/datasets/costco/products.py b/src/brightdata/datasets/costco/products.py new file mode 100644 index 0000000..6b0ed32 --- /dev/null +++ b/src/brightdata/datasets/costco/products.py @@ -0,0 +1,25 @@ +""" +Costco Products dataset. + +Costco products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CostcoProducts(BaseDataset): + """CostcoProducts dataset.""" + + DATASET_ID = "gd_mkcbmac44j178pook" + NAME = "costco_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/creative_commons/__init__.py b/src/brightdata/datasets/creative_commons/__init__.py new file mode 100644 index 0000000..9ca4091 --- /dev/null +++ b/src/brightdata/datasets/creative_commons/__init__.py @@ -0,0 +1,6 @@ +"""Creative Commons datasets.""" + +from .images import CreativeCommonsImages +from .models_3d import CreativeCommons3DModels + +__all__ = ["CreativeCommonsImages", "CreativeCommons3DModels"] diff --git a/src/brightdata/datasets/creative_commons/images.py b/src/brightdata/datasets/creative_commons/images.py new file mode 100644 index 0000000..f416875 --- /dev/null +++ b/src/brightdata/datasets/creative_commons/images.py @@ -0,0 +1,25 @@ +""" +Creative Commons Images dataset. + +Creative Commons images dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CreativeCommonsImages(BaseDataset): + """CreativeCommonsImages dataset.""" + + DATASET_ID = "gd_m23cxdw82ct6k022y3" + NAME = "creative_commons_images" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/creative_commons/models_3d.py b/src/brightdata/datasets/creative_commons/models_3d.py new file mode 100644 index 0000000..0aa9448 --- /dev/null +++ b/src/brightdata/datasets/creative_commons/models_3d.py @@ -0,0 +1,25 @@ +""" +Creative Commons 3D Models dataset. + +Creative Commons 3D models dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CreativeCommons3DModels(BaseDataset): + """CreativeCommons3DModels dataset.""" + + DATASET_ID = "gd_m4jr2hyr2kfhtvba6e" + NAME = "creative_commons_3d_models" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/ebay/__init__.py b/src/brightdata/datasets/ebay/__init__.py new file mode 100644 index 0000000..015843f --- /dev/null +++ b/src/brightdata/datasets/ebay/__init__.py @@ -0,0 +1,5 @@ +"""eBay dataset.""" + +from .products import EbayProducts + +__all__ = ["EbayProducts"] diff --git a/src/brightdata/datasets/ebay/products.py b/src/brightdata/datasets/ebay/products.py new file mode 100644 index 0000000..f1b189e --- /dev/null +++ b/src/brightdata/datasets/ebay/products.py @@ -0,0 +1,25 @@ +""" +eBay Products dataset. + +eBay products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class EbayProducts(BaseDataset): + """EbayProducts dataset.""" + + DATASET_ID = "gd_ltr9mjt81n0zzdk1fb" + NAME = "ebay_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/etsy/__init__.py b/src/brightdata/datasets/etsy/__init__.py new file mode 100644 index 0000000..05cc34d --- /dev/null +++ b/src/brightdata/datasets/etsy/__init__.py @@ -0,0 +1,5 @@ +"""Etsy dataset.""" + +from .products import EtsyProducts + +__all__ = ["EtsyProducts"] diff --git a/src/brightdata/datasets/etsy/products.py b/src/brightdata/datasets/etsy/products.py new file mode 100644 index 0000000..1243eef --- /dev/null +++ b/src/brightdata/datasets/etsy/products.py @@ -0,0 +1,25 @@ +""" +Etsy Products dataset. + +Etsy products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class EtsyProducts(BaseDataset): + """EtsyProducts dataset.""" + + DATASET_ID = "gd_ltppk0jdv1jqz25mz" + NAME = "etsy_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/facebook/__init__.py b/src/brightdata/datasets/facebook/__init__.py index 0d943ef..2a07c8a 100644 --- a/src/brightdata/datasets/facebook/__init__.py +++ b/src/brightdata/datasets/facebook/__init__.py @@ -1,5 +1,25 @@ """Facebook datasets.""" from .pages_posts import FacebookPagesPosts +from .comments import FacebookComments +from .posts_by_url import FacebookPostsByUrl +from .reels import FacebookReels +from .marketplace import FacebookMarketplace +from .company_reviews import FacebookCompanyReviews +from .events import FacebookEvents +from .profiles import FacebookProfiles +from .pages_profiles import FacebookPagesProfiles +from .group_posts import FacebookGroupPosts -__all__ = ["FacebookPagesPosts"] +__all__ = [ + "FacebookPagesPosts", + "FacebookComments", + "FacebookPostsByUrl", + "FacebookReels", + "FacebookMarketplace", + "FacebookCompanyReviews", + "FacebookEvents", + "FacebookProfiles", + "FacebookPagesProfiles", + "FacebookGroupPosts", +] diff --git a/src/brightdata/datasets/facebook/comments.py b/src/brightdata/datasets/facebook/comments.py new file mode 100644 index 0000000..07ca75a --- /dev/null +++ b/src/brightdata/datasets/facebook/comments.py @@ -0,0 +1,25 @@ +""" +Facebook Comments dataset. + +Comments from Facebook posts including replies, reactions, and engagement data. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookComments(BaseDataset): + """FacebookComments dataset.""" + + DATASET_ID = "gd_lkay758p1eanlolqw8" + NAME = "facebook_comments" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/facebook/company_reviews.py b/src/brightdata/datasets/facebook/company_reviews.py new file mode 100644 index 0000000..6a475dc --- /dev/null +++ b/src/brightdata/datasets/facebook/company_reviews.py @@ -0,0 +1,25 @@ +""" +Facebook Company Reviews dataset. + +Company reviews from Facebook pages with ratings and reviewer details. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookCompanyReviews(BaseDataset): + """FacebookCompanyReviews dataset.""" + + DATASET_ID = "gd_m0dtqpiu1mbcyc2g86" + NAME = "facebook_company_reviews" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/facebook/events.py b/src/brightdata/datasets/facebook/events.py new file mode 100644 index 0000000..21db6c4 --- /dev/null +++ b/src/brightdata/datasets/facebook/events.py @@ -0,0 +1,25 @@ +""" +Facebook Events dataset. + +Facebook events with dates, locations, descriptions, and attendee counts. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookEvents(BaseDataset): + """FacebookEvents dataset.""" + + DATASET_ID = "gd_m14sd0to1jz48ppm51" + NAME = "facebook_events" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/facebook/group_posts.py b/src/brightdata/datasets/facebook/group_posts.py new file mode 100644 index 0000000..abfadc6 --- /dev/null +++ b/src/brightdata/datasets/facebook/group_posts.py @@ -0,0 +1,25 @@ +""" +Facebook Group Posts dataset. + +Posts from Facebook groups with content, reactions, and comment counts. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookGroupPosts(BaseDataset): + """FacebookGroupPosts dataset.""" + + DATASET_ID = "gd_lz11l67o2cb3r0lkj3" + NAME = "facebook_group_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/facebook/marketplace.py b/src/brightdata/datasets/facebook/marketplace.py new file mode 100644 index 0000000..e240930 --- /dev/null +++ b/src/brightdata/datasets/facebook/marketplace.py @@ -0,0 +1,25 @@ +""" +Facebook Marketplace dataset. + +Facebook Marketplace listings with product details, pricing, and seller information. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookMarketplace(BaseDataset): + """FacebookMarketplace dataset.""" + + DATASET_ID = "gd_lvt9iwuh6fbcwmx1a" + NAME = "facebook_marketplace" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/facebook/pages_profiles.py b/src/brightdata/datasets/facebook/pages_profiles.py new file mode 100644 index 0000000..9094631 --- /dev/null +++ b/src/brightdata/datasets/facebook/pages_profiles.py @@ -0,0 +1,25 @@ +""" +Facebook Pages Profiles dataset. + +Facebook page profiles with page details, follower counts, and category information. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookPagesProfiles(BaseDataset): + """FacebookPagesProfiles dataset.""" + + DATASET_ID = "gd_mf124a0511bauquyow" + NAME = "facebook_pages_profiles" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/facebook/posts_by_url.py b/src/brightdata/datasets/facebook/posts_by_url.py new file mode 100644 index 0000000..76e33b4 --- /dev/null +++ b/src/brightdata/datasets/facebook/posts_by_url.py @@ -0,0 +1,25 @@ +""" +Facebook Posts By URL dataset. + +Facebook posts collected by direct URL with full post content and metadata. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookPostsByUrl(BaseDataset): + """FacebookPostsByUrl dataset.""" + + DATASET_ID = "gd_lyclm1571iy3mv57zw" + NAME = "facebook_posts_by_url" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/facebook/profiles.py b/src/brightdata/datasets/facebook/profiles.py new file mode 100644 index 0000000..8a18e0c --- /dev/null +++ b/src/brightdata/datasets/facebook/profiles.py @@ -0,0 +1,25 @@ +""" +Facebook Profiles dataset. + +Facebook user profiles with personal info, friends count, and activity data. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookProfiles(BaseDataset): + """FacebookProfiles dataset.""" + + DATASET_ID = "gd_mf0urb782734ik94dz" + NAME = "facebook_profiles" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/facebook/reels.py b/src/brightdata/datasets/facebook/reels.py new file mode 100644 index 0000000..78ad7f4 --- /dev/null +++ b/src/brightdata/datasets/facebook/reels.py @@ -0,0 +1,25 @@ +""" +Facebook Reels dataset. + +Facebook Reels video content with views, reactions, and engagement metrics. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookReels(BaseDataset): + """FacebookReels dataset.""" + + DATASET_ID = "gd_lyclm3ey2q6rww027t" + NAME = "facebook_reels" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/github/__init__.py b/src/brightdata/datasets/github/__init__.py new file mode 100644 index 0000000..5df301f --- /dev/null +++ b/src/brightdata/datasets/github/__init__.py @@ -0,0 +1,5 @@ +"""GitHub dataset.""" + +from .repositories import GithubRepositories + +__all__ = ["GithubRepositories"] diff --git a/src/brightdata/datasets/github/repositories.py b/src/brightdata/datasets/github/repositories.py new file mode 100644 index 0000000..b72ec19 --- /dev/null +++ b/src/brightdata/datasets/github/repositories.py @@ -0,0 +1,25 @@ +""" +GitHub Repositories dataset. + +GitHub repository information dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class GithubRepositories(BaseDataset): + """GithubRepositories dataset.""" + + DATASET_ID = "gd_lyrexgxc24b3d4imjt" + NAME = "github_repositories" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/google_maps/__init__.py b/src/brightdata/datasets/google_maps/__init__.py index 412883b..1453afb 100644 --- a/src/brightdata/datasets/google_maps/__init__.py +++ b/src/brightdata/datasets/google_maps/__init__.py @@ -1,5 +1,9 @@ """Google Maps datasets.""" from .reviews import GoogleMapsReviews +from .full_info import GoogleMapsFullInfo -__all__ = ["GoogleMapsReviews"] +__all__ = [ + "GoogleMapsReviews", + "GoogleMapsFullInfo", +] diff --git a/src/brightdata/datasets/google_maps/full_info.py b/src/brightdata/datasets/google_maps/full_info.py new file mode 100644 index 0000000..50f9b38 --- /dev/null +++ b/src/brightdata/datasets/google_maps/full_info.py @@ -0,0 +1,25 @@ +""" +Google Maps Full Info dataset. + +Complete Google Maps business information with details, hours, and contact data. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class GoogleMapsFullInfo(BaseDataset): + """GoogleMapsFullInfo dataset.""" + + DATASET_ID = "gd_m8ebnr0q2qlklc02fz" + NAME = "google_maps_full_info" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/google_news/__init__.py b/src/brightdata/datasets/google_news/__init__.py new file mode 100644 index 0000000..270932b --- /dev/null +++ b/src/brightdata/datasets/google_news/__init__.py @@ -0,0 +1,5 @@ +"""Google News dataset.""" + +from .news import GoogleNews + +__all__ = ["GoogleNews"] diff --git a/src/brightdata/datasets/google_news/news.py b/src/brightdata/datasets/google_news/news.py new file mode 100644 index 0000000..984628e --- /dev/null +++ b/src/brightdata/datasets/google_news/news.py @@ -0,0 +1,25 @@ +""" +Google News dataset. + +Google News articles dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class GoogleNews(BaseDataset): + """GoogleNews dataset.""" + + DATASET_ID = "gd_lnsxoxzi1omrwnka5r" + NAME = "google_news" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/google_play/__init__.py b/src/brightdata/datasets/google_play/__init__.py new file mode 100644 index 0000000..ee96916 --- /dev/null +++ b/src/brightdata/datasets/google_play/__init__.py @@ -0,0 +1,6 @@ +"""Google Play datasets.""" + +from .store import GooglePlayStore +from .reviews import GooglePlayReviews + +__all__ = ["GooglePlayStore", "GooglePlayReviews"] diff --git a/src/brightdata/datasets/google_play/reviews.py b/src/brightdata/datasets/google_play/reviews.py new file mode 100644 index 0000000..2ee94c6 --- /dev/null +++ b/src/brightdata/datasets/google_play/reviews.py @@ -0,0 +1,25 @@ +""" +Google Play Reviews dataset. + +Google Play Store reviews dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class GooglePlayReviews(BaseDataset): + """GooglePlayReviews dataset.""" + + DATASET_ID = "gd_m6zagkt024uwvvwuyu" + NAME = "google_play_reviews" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/google_play/store.py b/src/brightdata/datasets/google_play/store.py new file mode 100644 index 0000000..5604d80 --- /dev/null +++ b/src/brightdata/datasets/google_play/store.py @@ -0,0 +1,25 @@ +""" +Google Play Store dataset. + +Google Play Store apps dataset. 
+ +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class GooglePlayStore(BaseDataset): + """GooglePlayStore dataset.""" + + DATASET_ID = "gd_lsk382l8xei8vzm4u" + NAME = "google_play_store" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/google_shopping/__init__.py b/src/brightdata/datasets/google_shopping/__init__.py new file mode 100644 index 0000000..dec48ea --- /dev/null +++ b/src/brightdata/datasets/google_shopping/__init__.py @@ -0,0 +1,6 @@ +"""Google Shopping datasets.""" + +from .products import GoogleShoppingProducts +from .search_us import GoogleShoppingSearchUS + +__all__ = ["GoogleShoppingProducts", "GoogleShoppingSearchUS"] diff --git a/src/brightdata/datasets/google_shopping/products.py b/src/brightdata/datasets/google_shopping/products.py new file mode 100644 index 0000000..ed6525a --- /dev/null +++ b/src/brightdata/datasets/google_shopping/products.py @@ -0,0 +1,25 @@ +""" +Google Shopping Products dataset. + +Google Shopping products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class GoogleShoppingProducts(BaseDataset): + """GoogleShoppingProducts dataset.""" + + DATASET_ID = "gd_ltppk50q18kdw67omz" + NAME = "google_shopping_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/google_shopping/search_us.py b/src/brightdata/datasets/google_shopping/search_us.py new file mode 100644 index 0000000..142b349 --- /dev/null +++ b/src/brightdata/datasets/google_shopping/search_us.py @@ -0,0 +1,25 @@ +""" +Google Shopping Search US dataset. + +Google Shopping products search US dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class GoogleShoppingSearchUS(BaseDataset): + """GoogleShoppingSearchUS dataset.""" + + DATASET_ID = "gd_m31f2k0d2m1bah4f3b" + NAME = "google_shopping_search_us" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/homedepot/__init__.py b/src/brightdata/datasets/homedepot/__init__.py new file mode 100644 index 0000000..d7fa1fc --- /dev/null +++ b/src/brightdata/datasets/homedepot/__init__.py @@ -0,0 +1,6 @@ +"""Home Depot datasets.""" + +from .products_ca import HomeDepotCAProducts +from .products_us import HomeDepotUSProducts + +__all__ = ["HomeDepotUSProducts", "HomeDepotCAProducts"] diff --git a/src/brightdata/datasets/homedepot/products_ca.py b/src/brightdata/datasets/homedepot/products_ca.py new file mode 100644 index 0000000..35d5c4b --- /dev/null +++ b/src/brightdata/datasets/homedepot/products_ca.py @@ 
-0,0 +1,25 @@ +""" +Home Depot Canada Products dataset. + +Home Depot Canada products dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class HomeDepotCAProducts(BaseDataset): + """HomeDepotCAProducts dataset.""" + + DATASET_ID = "gd_lmyvvktscoojdor83" + NAME = "homedepot_ca_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/homedepot/products_us.py b/src/brightdata/datasets/homedepot/products_us.py new file mode 100644 index 0000000..a2afee1 --- /dev/null +++ b/src/brightdata/datasets/homedepot/products_us.py @@ -0,0 +1,25 @@ +""" +Home Depot US Products dataset. + +Home Depot US products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class HomeDepotUSProducts(BaseDataset): + """HomeDepotUSProducts dataset.""" + + DATASET_ID = "gd_lmusivh019i7g97q2n" + NAME = "homedepot_us_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/instagram/__init__.py b/src/brightdata/datasets/instagram/__init__.py index 3ba3156..b8504e9 100644 --- a/src/brightdata/datasets/instagram/__init__.py +++ b/src/brightdata/datasets/instagram/__init__.py @@ -2,5 +2,12 @@ from .profiles import InstagramProfiles from .posts import InstagramPosts +from .comments import InstagramComments +from .reels import InstagramReels -__all__ = ["InstagramProfiles", "InstagramPosts"] +__all__ = [ + "InstagramProfiles", + "InstagramPosts", + "InstagramComments", + "InstagramReels", +] diff --git a/src/brightdata/datasets/instagram/comments.py b/src/brightdata/datasets/instagram/comments.py new file mode 100644 index 0000000..ea90da7 --- /dev/null +++ b/src/brightdata/datasets/instagram/comments.py @@ -0,0 +1,25 @@ +""" +Instagram Comments dataset. + +Comments from Instagram posts with text, likes, and commenter information. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class InstagramComments(BaseDataset): + """InstagramComments dataset.""" + + DATASET_ID = "gd_ltppn085pokosxh13" + NAME = "instagram_comments" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/instagram/reels.py b/src/brightdata/datasets/instagram/reels.py new file mode 100644 index 0000000..78135a6 --- /dev/null +++ b/src/brightdata/datasets/instagram/reels.py @@ -0,0 +1,25 @@ +""" +Instagram Reels dataset. + +Instagram Reels video content with views, likes, and engagement data. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class InstagramReels(BaseDataset): + """InstagramReels dataset.""" + + DATASET_ID = "gd_lyclm20il4r5helnj" + NAME = "instagram_reels" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/kroger/__init__.py b/src/brightdata/datasets/kroger/__init__.py new file mode 100644 index 0000000..d5f0365 --- /dev/null +++ b/src/brightdata/datasets/kroger/__init__.py @@ -0,0 +1,5 @@ +"""Kroger dataset.""" + +from .products import KrogerProducts + +__all__ = ["KrogerProducts"] diff --git a/src/brightdata/datasets/kroger/products.py b/src/brightdata/datasets/kroger/products.py new file mode 100644 index 0000000..1a7cca0 --- /dev/null +++ b/src/brightdata/datasets/kroger/products.py @@ -0,0 +1,25 @@ +""" +Kroger Products dataset. + +Kroger.com products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class KrogerProducts(BaseDataset): + """KrogerProducts dataset.""" + + DATASET_ID = "gd_mhlod8vh2kwgoi9yw3" + NAME = "kroger_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/lazada/__init__.py b/src/brightdata/datasets/lazada/__init__.py index 6b26803..897bb2d 100644 --- a/src/brightdata/datasets/lazada/__init__.py +++ b/src/brightdata/datasets/lazada/__init__.py @@ -1,5 +1,11 @@ """Lazada datasets.""" from .products import LazadaProducts +from .reviews import LazadaReviews +from .products_search import LazadaProductsSearch -__all__ = ["LazadaProducts"] +__all__ = [ + "LazadaProducts", + "LazadaReviews", + "LazadaProductsSearch", +] diff --git a/src/brightdata/datasets/lazada/products_search.py b/src/brightdata/datasets/lazada/products_search.py new file mode 100644 index 0000000..3d36d04 --- /dev/null +++ b/src/brightdata/datasets/lazada/products_search.py @@ -0,0 +1,25 @@ +""" +Lazada Products Search dataset. + +Lazada product search results with listings, prices, and seller data. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LazadaProductsSearch(BaseDataset): + """LazadaProductsSearch dataset.""" + + DATASET_ID = "gd_lwd9icor28eg4srnxi" + NAME = "lazada_products_search" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/lazada/reviews.py b/src/brightdata/datasets/lazada/reviews.py new file mode 100644 index 0000000..dedcdf2 --- /dev/null +++ b/src/brightdata/datasets/lazada/reviews.py @@ -0,0 +1,25 @@ +""" +Lazada Reviews dataset. + +Product reviews from Lazada with ratings, text, and reviewer information. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LazadaReviews(BaseDataset): + """LazadaReviews dataset.""" + + DATASET_ID = "gd_lub6mys21lzcklkq1z" + NAME = "lazada_reviews" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/linkedin/__init__.py b/src/brightdata/datasets/linkedin/__init__.py index 683fa7b..b123902 100644 --- a/src/brightdata/datasets/linkedin/__init__.py +++ b/src/brightdata/datasets/linkedin/__init__.py @@ -3,5 +3,13 @@ from .people_profiles import LinkedInPeopleProfiles from .company_profiles import LinkedInCompanyProfiles from .job_listings import LinkedInJobListings +from .posts import LinkedInPosts +from .profiles_job_listings import LinkedInProfilesJobListings -__all__ = ["LinkedInPeopleProfiles", "LinkedInCompanyProfiles", "LinkedInJobListings"] +__all__ = [ + "LinkedInPeopleProfiles", + "LinkedInCompanyProfiles", + "LinkedInJobListings", + 
"LinkedInPosts", + "LinkedInProfilesJobListings", +] diff --git a/src/brightdata/datasets/linkedin/posts.py b/src/brightdata/datasets/linkedin/posts.py new file mode 100644 index 0000000..38980b2 --- /dev/null +++ b/src/brightdata/datasets/linkedin/posts.py @@ -0,0 +1,25 @@ +""" +LinkedIn Posts dataset. + +LinkedIn posts with content, reactions, comments, and engagement metrics. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LinkedInPosts(BaseDataset): + """LinkedInPosts dataset.""" + + DATASET_ID = "gd_lyy3tktm25m4avu764" + NAME = "linkedin_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/linkedin/profiles_job_listings.py b/src/brightdata/datasets/linkedin/profiles_job_listings.py new file mode 100644 index 0000000..97e0e1e --- /dev/null +++ b/src/brightdata/datasets/linkedin/profiles_job_listings.py @@ -0,0 +1,25 @@ +""" +LinkedIn Profiles Job Listings dataset. + +Job listings associated with LinkedIn profiles including role details and requirements. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LinkedInProfilesJobListings(BaseDataset): + """LinkedInProfilesJobListings dataset.""" + + DATASET_ID = "gd_m487ihp32jtc4ujg45" + NAME = "linkedin_profiles_job_listings" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/lowes/__init__.py b/src/brightdata/datasets/lowes/__init__.py new file mode 100644 index 0000000..6a06fdf --- /dev/null +++ b/src/brightdata/datasets/lowes/__init__.py @@ -0,0 +1,5 @@ +"""Lowes dataset.""" + +from .products import LowesProducts + +__all__ = ["LowesProducts"] diff --git a/src/brightdata/datasets/lowes/products.py b/src/brightdata/datasets/lowes/products.py new file mode 100644 index 0000000..fd41c76 --- /dev/null +++ b/src/brightdata/datasets/lowes/products.py @@ -0,0 +1,25 @@ +""" +Lowes Products dataset. + +Lowes.com products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LowesProducts(BaseDataset): + """LowesProducts dataset.""" + + DATASET_ID = "gd_lnvl79pfftqh18u2o" + NAME = "lowes_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/macys/__init__.py b/src/brightdata/datasets/macys/__init__.py new file mode 100644 index 0000000..802cc87 --- /dev/null +++ b/src/brightdata/datasets/macys/__init__.py @@ -0,0 +1,5 @@ +"""Macys dataset.""" + +from .products import MacysProducts + +__all__ = ["MacysProducts"] diff --git a/src/brightdata/datasets/macys/products.py b/src/brightdata/datasets/macys/products.py new file mode 100644 index 0000000..8c862a2 --- /dev/null +++ b/src/brightdata/datasets/macys/products.py @@ -0,0 +1,25 @@ +""" +Macys Products dataset. + +Macys.com products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MacysProducts(BaseDataset): + """MacysProducts dataset.""" + + DATASET_ID = "gd_miebqh4a18ivg65bpa" + NAME = "macys_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/mercadolivre/__init__.py b/src/brightdata/datasets/mercadolivre/__init__.py new file mode 100644 index 0000000..292117b --- /dev/null +++ b/src/brightdata/datasets/mercadolivre/__init__.py @@ -0,0 +1,5 @@ +"""MercadoLivre dataset.""" + +from .products import MercadolivreProducts + +__all__ = ["MercadolivreProducts"] diff --git a/src/brightdata/datasets/mercadolivre/products.py b/src/brightdata/datasets/mercadolivre/products.py new file mode 100644 index 0000000..948a7c8 --- /dev/null +++ b/src/brightdata/datasets/mercadolivre/products.py @@ -0,0 +1,25 @@ +""" +MercadoLivre Products dataset. + +MercadoLivre products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MercadolivreProducts(BaseDataset): + """MercadolivreProducts dataset.""" + + DATASET_ID = "gd_m7re62tb1w88ymy86r" + NAME = "mercadolivre_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/microcenter/__init__.py b/src/brightdata/datasets/microcenter/__init__.py new file mode 100644 index 0000000..d7f431c --- /dev/null +++ b/src/brightdata/datasets/microcenter/__init__.py @@ -0,0 +1,5 @@ +"""Micro Center dataset.""" + +from .products import MicroCenterProducts + +__all__ = ["MicroCenterProducts"] diff --git a/src/brightdata/datasets/microcenter/products.py b/src/brightdata/datasets/microcenter/products.py new file mode 100644 index 0000000..c377913 --- /dev/null +++ b/src/brightdata/datasets/microcenter/products.py @@ -0,0 +1,25 @@ +""" +Micro Center Products dataset. + +Micro Center products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MicroCenterProducts(BaseDataset): + """MicroCenterProducts dataset.""" + + DATASET_ID = "gd_mkckexq2uquhupguv" + NAME = "microcenter_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/myntra/__init__.py b/src/brightdata/datasets/myntra/__init__.py new file mode 100644 index 0000000..dd89e2f --- /dev/null +++ b/src/brightdata/datasets/myntra/__init__.py @@ -0,0 +1,5 @@ +"""Myntra dataset.""" + +from .products import MyntraProducts + +__all__ = ["MyntraProducts"] diff --git a/src/brightdata/datasets/myntra/products.py b/src/brightdata/datasets/myntra/products.py new file mode 100644 index 0000000..adc5702 --- /dev/null +++ b/src/brightdata/datasets/myntra/products.py @@ -0,0 +1,25 @@ +""" +Myntra Products dataset. + +Myntra products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MyntraProducts(BaseDataset): + """MyntraProducts dataset.""" + + DATASET_ID = "gd_lptvxr8b1qx1d9thgp" + NAME = "myntra_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/naver/__init__.py b/src/brightdata/datasets/naver/__init__.py new file mode 100644 index 0000000..5face3f --- /dev/null +++ b/src/brightdata/datasets/naver/__init__.py @@ -0,0 +1,5 @@ +"""Naver dataset.""" + +from .products import NaverProducts + +__all__ = ["NaverProducts"] diff --git a/src/brightdata/datasets/naver/products.py b/src/brightdata/datasets/naver/products.py new file mode 100644 index 0000000..804daa6 --- /dev/null +++ b/src/brightdata/datasets/naver/products.py @@ -0,0 +1,25 @@ +""" +Naver Products dataset. + +Naver products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class NaverProducts(BaseDataset): + """NaverProducts dataset.""" + + DATASET_ID = "gd_m9qqjxxr1hab7okefj" + NAME = "naver_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/ozon/__init__.py b/src/brightdata/datasets/ozon/__init__.py new file mode 100644 index 0000000..81111a9 --- /dev/null +++ b/src/brightdata/datasets/ozon/__init__.py @@ -0,0 +1,5 @@ +"""Ozon dataset.""" + +from .products import OzonProducts + +__all__ = ["OzonProducts"] diff --git a/src/brightdata/datasets/ozon/products.py b/src/brightdata/datasets/ozon/products.py new file mode 100644 index 0000000..9b1fcab --- /dev/null +++ b/src/brightdata/datasets/ozon/products.py @@ -0,0 +1,25 @@ +""" +Ozon Products dataset. + +Ozon.ru products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class OzonProducts(BaseDataset): + """OzonProducts dataset.""" + + DATASET_ID = "gd_lutq85sl13rlndbzai" + NAME = "ozon_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/quora/__init__.py b/src/brightdata/datasets/quora/__init__.py new file mode 100644 index 0000000..3e5554c --- /dev/null +++ b/src/brightdata/datasets/quora/__init__.py @@ -0,0 +1,5 @@ +"""Quora dataset.""" + +from .posts import QuoraPosts + +__all__ = ["QuoraPosts"] diff --git a/src/brightdata/datasets/quora/posts.py b/src/brightdata/datasets/quora/posts.py new file mode 100644 index 0000000..992b1c0 --- /dev/null +++ b/src/brightdata/datasets/quora/posts.py @@ -0,0 +1,25 @@ +""" +Quora Posts dataset. + +Quora posts dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class QuoraPosts(BaseDataset): + """QuoraPosts dataset.""" + + DATASET_ID = "gd_lvz1rbj81afv3m6n5y" + NAME = "quora_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/realtor/__init__.py b/src/brightdata/datasets/realtor/__init__.py new file mode 100644 index 0000000..d9039ef --- /dev/null +++ b/src/brightdata/datasets/realtor/__init__.py @@ -0,0 +1,5 @@ +"""Realtor dataset.""" + +from .international_properties import RealtorInternationalProperties + +__all__ = ["RealtorInternationalProperties"] diff --git a/src/brightdata/datasets/realtor/international_properties.py b/src/brightdata/datasets/realtor/international_properties.py new file mode 100644 index 0000000..47987b6 --- /dev/null +++ b/src/brightdata/datasets/realtor/international_properties.py @@ -0,0 +1,25 @@ +""" +Realtor International Properties dataset. + +Realtor international property listings dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class RealtorInternationalProperties(BaseDataset): + """RealtorInternationalProperties dataset.""" + + DATASET_ID = "gd_m517agnc1jppzwgtmw" + NAME = "realtor_international_properties" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/reddit/__init__.py b/src/brightdata/datasets/reddit/__init__.py new file mode 100644 index 0000000..2c5cab3 --- /dev/null +++ b/src/brightdata/datasets/reddit/__init__.py @@ -0,0 +1,6 @@ +"""Reddit datasets.""" + +from .posts import RedditPosts +from .comments import RedditComments + +__all__ = ["RedditPosts", "RedditComments"] diff --git a/src/brightdata/datasets/reddit/comments.py b/src/brightdata/datasets/reddit/comments.py new file mode 100644 index 0000000..bf1d807 --- /dev/null +++ b/src/brightdata/datasets/reddit/comments.py @@ -0,0 +1,25 @@ +""" +Reddit Comments dataset. + +Reddit comments dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class RedditComments(BaseDataset): + """RedditComments dataset.""" + + DATASET_ID = "gd_lvzdpsdlw09j6t702" + NAME = "reddit_comments" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/reddit/posts.py b/src/brightdata/datasets/reddit/posts.py new file mode 100644 index 0000000..6d0401f --- /dev/null +++ b/src/brightdata/datasets/reddit/posts.py @@ -0,0 +1,25 @@ +""" +Reddit Posts dataset. + +Reddit posts dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class RedditPosts(BaseDataset): + """RedditPosts dataset.""" + + DATASET_ID = "gd_lvz8ah06191smkebj4" + NAME = "reddit_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/rona/__init__.py b/src/brightdata/datasets/rona/__init__.py new file mode 100644 index 0000000..8afa648 --- /dev/null +++ b/src/brightdata/datasets/rona/__init__.py @@ -0,0 +1,5 @@ +"""Rona dataset.""" + +from .products import RonaProducts + +__all__ = ["RonaProducts"] diff --git a/src/brightdata/datasets/rona/products.py b/src/brightdata/datasets/rona/products.py new file mode 100644 index 0000000..089e739 --- /dev/null +++ b/src/brightdata/datasets/rona/products.py @@ -0,0 +1,25 @@ +""" +Rona Products dataset. + +Rona.ca products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class RonaProducts(BaseDataset): + """RonaProducts dataset.""" + + DATASET_ID = "gd_mf53alwg1a35fv69pw" + NAME = "rona_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/snapchat/__init__.py b/src/brightdata/datasets/snapchat/__init__.py new file mode 100644 index 0000000..b310bb1 --- /dev/null +++ b/src/brightdata/datasets/snapchat/__init__.py @@ -0,0 +1,5 @@ +"""Snapchat dataset.""" + +from .posts import SnapchatPosts + +__all__ = ["SnapchatPosts"] diff --git a/src/brightdata/datasets/snapchat/posts.py b/src/brightdata/datasets/snapchat/posts.py new file mode 100644 index 0000000..695ffc5 --- /dev/null +++ b/src/brightdata/datasets/snapchat/posts.py @@ -0,0 +1,25 @@ +""" +Snapchat Posts dataset. + +Snapchat posts dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class SnapchatPosts(BaseDataset): + """SnapchatPosts dataset.""" + + DATASET_ID = "gd_ma0ydx431w6stl16ge" + NAME = "snapchat_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/tiktok/__init__.py b/src/brightdata/datasets/tiktok/__init__.py index 8569218..b7b2190 100644 --- a/src/brightdata/datasets/tiktok/__init__.py +++ b/src/brightdata/datasets/tiktok/__init__.py @@ -1,5 +1,13 @@ """TikTok datasets.""" from .profiles import TikTokProfiles +from .comments import TikTokComments +from .posts import TikTokPosts +from .shop import TikTokShop -__all__ = ["TikTokProfiles"] +__all__ = [ + "TikTokProfiles", + "TikTokComments", + "TikTokPosts", + "TikTokShop", +] diff --git a/src/brightdata/datasets/tiktok/comments.py b/src/brightdata/datasets/tiktok/comments.py new file mode 100644 index 0000000..f926d9d --- /dev/null +++ b/src/brightdata/datasets/tiktok/comments.py @@ -0,0 +1,25 @@ +""" +TikTok Comments dataset. + +Comments from TikTok videos with text, likes, and reply data. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class TikTokComments(BaseDataset): + """TikTokComments dataset.""" + + DATASET_ID = "gd_lkf2st302ap89utw5k" + NAME = "tiktok_comments" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/tiktok/posts.py b/src/brightdata/datasets/tiktok/posts.py new file mode 100644 index 0000000..3158222 --- /dev/null +++ b/src/brightdata/datasets/tiktok/posts.py @@ -0,0 +1,25 @@ +""" +TikTok Posts dataset. + +TikTok video posts with captions, view counts, and engagement metrics. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class TikTokPosts(BaseDataset): + """TikTokPosts dataset.""" + + DATASET_ID = "gd_lu702nij2f790tmv9h" + NAME = "tiktok_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/tiktok/shop.py b/src/brightdata/datasets/tiktok/shop.py new file mode 100644 index 0000000..6b90747 --- /dev/null +++ b/src/brightdata/datasets/tiktok/shop.py @@ -0,0 +1,25 @@ +""" +TikTok Shop dataset. + +TikTok Shop product listings with pricing, ratings, and seller details. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class TikTokShop(BaseDataset): + """TikTokShop dataset.""" + + DATASET_ID = "gd_m45m1u911dsa4274pi" + NAME = "tiktok_shop" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/tokopedia/__init__.py b/src/brightdata/datasets/tokopedia/__init__.py new file mode 100644 index 0000000..392695f --- /dev/null +++ b/src/brightdata/datasets/tokopedia/__init__.py @@ -0,0 +1,5 @@ +"""Tokopedia dataset.""" + +from .products import TokopediaProducts + +__all__ = ["TokopediaProducts"] diff --git a/src/brightdata/datasets/tokopedia/products.py b/src/brightdata/datasets/tokopedia/products.py new file mode 100644 index 0000000..8856fcd --- /dev/null +++ b/src/brightdata/datasets/tokopedia/products.py @@ -0,0 +1,25 @@ +""" +Tokopedia Products dataset. + +Tokopedia products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class TokopediaProducts(BaseDataset): + """TokopediaProducts dataset.""" + + DATASET_ID = "gd_lxk24yba297r8qd3tp" + NAME = "tokopedia_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/vimeo/__init__.py b/src/brightdata/datasets/vimeo/__init__.py new file mode 100644 index 0000000..5d1f245 --- /dev/null +++ b/src/brightdata/datasets/vimeo/__init__.py @@ -0,0 +1,5 @@ +"""Vimeo dataset.""" + +from .videos import VimeoVideos + +__all__ = ["VimeoVideos"] diff --git a/src/brightdata/datasets/vimeo/videos.py b/src/brightdata/datasets/vimeo/videos.py new file mode 100644 index 0000000..1a00204 --- /dev/null +++ b/src/brightdata/datasets/vimeo/videos.py @@ -0,0 +1,25 @@ +""" +Vimeo Videos dataset. + +Vimeo video posts dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class VimeoVideos(BaseDataset): + """VimeoVideos dataset.""" + + DATASET_ID = "gd_lxk88z3v1ketji4pn" + NAME = "vimeo_videos" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/walmart/__init__.py b/src/brightdata/datasets/walmart/__init__.py index b7541d2..0d40e39 100644 --- a/src/brightdata/datasets/walmart/__init__.py +++ b/src/brightdata/datasets/walmart/__init__.py @@ -1,5 +1,9 @@ """Walmart datasets.""" from .products import WalmartProducts +from .sellers import WalmartSellersInfo -__all__ = ["WalmartProducts"] +__all__ = [ + "WalmartProducts", + "WalmartSellersInfo", +] diff --git a/src/brightdata/datasets/walmart/sellers.py b/src/brightdata/datasets/walmart/sellers.py new file mode 100644 index 0000000..65f9884 --- /dev/null +++ b/src/brightdata/datasets/walmart/sellers.py @@ -0,0 +1,25 @@ +""" +Walmart Sellers Info dataset. + +Walmart seller profiles with ratings, product counts, and business information. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class WalmartSellersInfo(BaseDataset): + """WalmartSellersInfo dataset.""" + + DATASET_ID = "gd_m7ke48w81ocyu4hhz0" + NAME = "walmart_sellers_info" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/wayfair/__init__.py b/src/brightdata/datasets/wayfair/__init__.py new file mode 100644 index 0000000..c3730cc --- /dev/null +++ b/src/brightdata/datasets/wayfair/__init__.py @@ -0,0 +1,5 @@ +"""Wayfair dataset.""" + +from .products import WayfairProducts + +__all__ = ["WayfairProducts"] diff --git a/src/brightdata/datasets/wayfair/products.py b/src/brightdata/datasets/wayfair/products.py new file mode 100644 index 0000000..a67cdae --- /dev/null +++ b/src/brightdata/datasets/wayfair/products.py @@ -0,0 +1,25 @@ +""" +Wayfair Products dataset. + +Wayfair products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class WayfairProducts(BaseDataset): + """WayfairProducts dataset.""" + + DATASET_ID = "gd_ltr9ne3p24zrhrbu28" + NAME = "wayfair_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/wikipedia/__init__.py b/src/brightdata/datasets/wikipedia/__init__.py new file mode 100644 index 0000000..1a0dc8a --- /dev/null +++ b/src/brightdata/datasets/wikipedia/__init__.py @@ -0,0 +1,5 @@ +"""Wikipedia dataset.""" + +from .articles import WikipediaArticles + +__all__ = ["WikipediaArticles"] diff --git a/src/brightdata/datasets/wikipedia/articles.py b/src/brightdata/datasets/wikipedia/articles.py new file mode 100644 index 0000000..49c4db5 --- /dev/null +++ b/src/brightdata/datasets/wikipedia/articles.py @@ -0,0 +1,25 @@ +""" +Wikipedia Articles dataset. + +Wikipedia articles dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class WikipediaArticles(BaseDataset): + """WikipediaArticles dataset.""" + + DATASET_ID = "gd_lr9978962kkjr3nx49" + NAME = "wikipedia_articles" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/wildberries/__init__.py b/src/brightdata/datasets/wildberries/__init__.py new file mode 100644 index 0000000..1d8acc5 --- /dev/null +++ b/src/brightdata/datasets/wildberries/__init__.py @@ -0,0 +1,5 @@ +"""Wildberries dataset.""" + +from .products import WildberriesProducts + +__all__ = ["WildberriesProducts"] diff --git a/src/brightdata/datasets/wildberries/products.py b/src/brightdata/datasets/wildberries/products.py new file mode 100644 index 0000000..5f3b40e --- /dev/null +++ b/src/brightdata/datasets/wildberries/products.py @@ -0,0 +1,25 @@ +""" +Wildberries Products dataset. + +Wildberries.ru products dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class WildberriesProducts(BaseDataset): + """WildberriesProducts dataset.""" + + DATASET_ID = "gd_luz4fboh2dicd27hhm" + NAME = "wildberries_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/x_twitter/__init__.py b/src/brightdata/datasets/x_twitter/__init__.py new file mode 100644 index 0000000..8b6f5de --- /dev/null +++ b/src/brightdata/datasets/x_twitter/__init__.py @@ -0,0 +1,6 @@ +"""X (Twitter) datasets.""" + +from .posts import XTwitterPosts +from .profiles import XTwitterProfiles + +__all__ = ["XTwitterPosts", "XTwitterProfiles"] diff --git a/src/brightdata/datasets/x_twitter/posts.py b/src/brightdata/datasets/x_twitter/posts.py new file mode 100644 index 0000000..4c5cdb4 --- /dev/null +++ b/src/brightdata/datasets/x_twitter/posts.py @@ -0,0 +1,25 @@ +""" +X (formerly Twitter) Posts dataset. + +X (formerly Twitter) posts dataset. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class XTwitterPosts(BaseDataset): + """XTwitterPosts dataset.""" + + DATASET_ID = "gd_lwxkxvnf1cynvib9co" + NAME = "x_twitter_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/x_twitter/profiles.py b/src/brightdata/datasets/x_twitter/profiles.py new file mode 100644 index 0000000..6da2550 --- /dev/null +++ b/src/brightdata/datasets/x_twitter/profiles.py @@ -0,0 +1,25 @@ +""" +X (formerly Twitter) Profiles dataset. + +X (formerly Twitter) profiles dataset. 
+ +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class XTwitterProfiles(BaseDataset): + """XTwitterProfiles dataset.""" + + DATASET_ID = "gd_lwxmeb2u1cniijd7t4" + NAME = "x_twitter_profiles" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/yahoo_finance/__init__.py b/src/brightdata/datasets/yahoo_finance/__init__.py new file mode 100644 index 0000000..037d3e5 --- /dev/null +++ b/src/brightdata/datasets/yahoo_finance/__init__.py @@ -0,0 +1,5 @@ +"""Yahoo Finance dataset.""" + +from .businesses import YahooFinanceBusinesses + +__all__ = ["YahooFinanceBusinesses"] diff --git a/src/brightdata/datasets/yahoo_finance/businesses.py b/src/brightdata/datasets/yahoo_finance/businesses.py new file mode 100644 index 0000000..8ccbfae --- /dev/null +++ b/src/brightdata/datasets/yahoo_finance/businesses.py @@ -0,0 +1,25 @@ +""" +Yahoo Finance Businesses dataset. + +Yahoo Finance business information dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class YahooFinanceBusinesses(BaseDataset): + """YahooFinanceBusinesses dataset.""" + + DATASET_ID = "gd_lmrpz3vxmz972ghd7" + NAME = "yahoo_finance_businesses" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/zillow/__init__.py b/src/brightdata/datasets/zillow/__init__.py index f99a42e..23b0254 100644 --- a/src/brightdata/datasets/zillow/__init__.py +++ b/src/brightdata/datasets/zillow/__init__.py @@ -1,5 +1,9 @@ """Zillow datasets.""" from .properties import ZillowProperties +from .price_history import ZillowPriceHistory -__all__ = ["ZillowProperties"] +__all__ = [ + "ZillowProperties", + "ZillowPriceHistory", +] diff --git a/src/brightdata/datasets/zillow/price_history.py b/src/brightdata/datasets/zillow/price_history.py new file mode 100644 index 0000000..37df2f2 --- /dev/null +++ b/src/brightdata/datasets/zillow/price_history.py @@ -0,0 +1,25 @@ +""" +Zillow Price History dataset. + +Historical pricing data for Zillow properties with date and price change records. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ZillowPriceHistory(BaseDataset): + """ZillowPriceHistory dataset.""" + + DATASET_ID = "gd_lxu1cz9r88uiqsosl" + NAME = "zillow_price_history" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/zoopla/__init__.py b/src/brightdata/datasets/zoopla/__init__.py new file mode 100644 index 0000000..095cacb --- /dev/null +++ b/src/brightdata/datasets/zoopla/__init__.py @@ -0,0 +1,5 @@ +"""Zoopla dataset.""" + +from .properties import ZooplaProperties + +__all__ = ["ZooplaProperties"] diff --git a/src/brightdata/datasets/zoopla/properties.py b/src/brightdata/datasets/zoopla/properties.py new file mode 100644 index 0000000..2038690 --- /dev/null +++ b/src/brightdata/datasets/zoopla/properties.py @@ -0,0 +1,25 @@ +""" +Zoopla Properties dataset. + +Zoopla UK property listings dataset. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ZooplaProperties(BaseDataset): + """Zoopla Properties dataset.""" + + DATASET_ID = "gd_lnabksndfp1pegwzh" + NAME = "zoopla_properties" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/scraper_studio/__init__.py b/src/brightdata/scraper_studio/__init__.py new file mode 100644 index 0000000..54f7c7f --- /dev/null +++ b/src/brightdata/scraper_studio/__init__.py @@ -0,0 +1,5 @@ +"""Scraper Studio - trigger and fetch results from user-created custom scrapers.""" + +from .models import ScraperStudioJob, JobStatus + +__all__ = ["ScraperStudioJob", "JobStatus"] diff --git a/src/brightdata/scraper_studio/client.py b/src/brightdata/scraper_studio/client.py new file mode 100644 index 0000000..2138025 --- /dev/null +++ b/src/brightdata/scraper_studio/client.py @@ -0,0 +1,137 @@ +""" +Scraper Studio API Client - HTTP operations for Bright Data's Scraper Studio API. + +Handles all HTTP communication with Bright Data's DCA (Data Collection Automation) endpoints: +- Triggering real-time scrapes (trigger_immediate) +- Fetching real-time results (get_result) +- Checking job status (log) + +Follows the same pattern as DatasetAPIClient and AsyncUnblockerClient. +""" + +from typing import Dict, List, Any + +from ..core.engine import AsyncEngine +from ..constants import HTTP_OK, HTTP_ACCEPTED +from ..exceptions import APIError, DataNotReadyError + + +BASE_URL = "https://api.brightdata.com" + + +class ScraperStudioAPIClient: + """ + Client for Bright Data Scraper Studio API operations. 
+ + Handles HTTP communication for DCA endpoints: + - POST /dca/trigger_immediate → trigger single-input scrape + - GET /dca/get_result → fetch real-time result + - GET /dca/log/{job_id} → job status + + Example: + >>> async with AsyncEngine(token) as engine: + ... client = ScraperStudioAPIClient(engine) + ... response_id = await client.trigger_immediate( + ... collector="c_abc123", + ... input={"url": "https://example.com"}, + ... ) + ... data = await client.fetch_immediate_result(response_id) + """ + + def __init__(self, engine: AsyncEngine): + self.engine = engine + + async def trigger_immediate( + self, + collector: str, + input: Dict[str, Any], + ) -> str: + """ + Trigger a real-time async scrape. + + Args: + collector: Scraper collector ID (e.g., "c_abc123"). + input: Single input object (e.g., {"url": "https://..."}). + + Returns: + response_id string for polling with fetch_immediate_result(). + + Raises: + APIError: If trigger request fails. + """ + url = f"{BASE_URL}/dca/trigger_immediate" + params = {"collector": collector} + + async with self.engine.post_to_url(url, json_data=input, params=params) as response: + if response.status in (HTTP_OK, HTTP_ACCEPTED): + data = await response.json() + response_id = data.get("response_id") + if not response_id: + raise APIError(f"No response_id in trigger_immediate response: {data}") + return response_id + else: + error_text = await response.text() + raise APIError( + f"trigger_immediate failed (HTTP {response.status}): {error_text}", + status_code=response.status, + ) + + async def fetch_immediate_result( + self, + response_id: str, + ) -> List[Dict[str, Any]]: + """ + Fetch results from a real-time async scrape. + + Args: + response_id: Response ID from trigger_immediate(). + + Returns: + List of scraped records. + + Raises: + DataNotReadyError: If data is not ready yet (HTTP 202). + APIError: If fetch request fails. 
+ """ + url = f"{BASE_URL}/dca/get_result" + params = {"response_id": response_id} + + async with self.engine.get_from_url(url, params=params) as response: + if response.status == HTTP_OK: + return await response.json() + elif response.status == 202: + raise DataNotReadyError(f"Data not ready for response_id={response_id}") + else: + error_text = await response.text() + raise APIError( + f"fetch_immediate_result failed (HTTP {response.status}): {error_text}", + status_code=response.status, + ) + + async def get_status( + self, + job_id: str, + ) -> Dict[str, Any]: + """ + Get job status/log. + + Args: + job_id: Job ID (e.g., "j_abc123"). + + Returns: + Raw API response dict (parsed into JobStatus by service layer). + + Raises: + APIError: If status request fails. + """ + url = f"{BASE_URL}/dca/log/{job_id}" + + async with self.engine.get_from_url(url) as response: + if response.status == HTTP_OK: + return await response.json() + else: + error_text = await response.text() + raise APIError( + f"get_status failed (HTTP {response.status}): {error_text}", + status_code=response.status, + ) diff --git a/src/brightdata/scraper_studio/models.py b/src/brightdata/scraper_studio/models.py new file mode 100644 index 0000000..2f4ff33 --- /dev/null +++ b/src/brightdata/scraper_studio/models.py @@ -0,0 +1,148 @@ +""" +Data models for Scraper Studio API responses. +""" + +import asyncio +import time +from dataclasses import dataclass +from typing import Dict, List, Any, Optional, TYPE_CHECKING + +from ..exceptions import DataNotReadyError + +if TYPE_CHECKING: + from .client import ScraperStudioAPIClient + + +@dataclass +class JobStatus: + """ + Job status returned by GET /dca/log/{job_id}. 
+ + Attributes: + id: Job identifier (e.g., "j_abc123") + status: Job status ("queued", "running", "done", "failed", "cancelled") + collector: Collector ID that ran this job + inputs: Number of input records + lines: Number of records collected + fails: Number of failed records + success_rate: Success rate (0.0 to 1.0) + created: ISO timestamp when job was created + started: ISO timestamp when job started processing + finished: ISO timestamp when job finished + job_time: Total job time in milliseconds + queue_time: Time spent in queue in milliseconds + """ + + id: str + status: str + collector: str + inputs: int = 0 + lines: int = 0 + fails: int = 0 + success_rate: float = 0.0 + created: str = "" + started: Optional[str] = None + finished: Optional[str] = None + job_time: Optional[int] = None + queue_time: Optional[int] = None + + @classmethod + def from_api_response(cls, data: Dict[str, Any]) -> "JobStatus": + """ + Create from API response. + + Handles mixed-case field names from the API (e.g., "Id", "Status", "Collector"). + """ + + def _get(key: str) -> Any: + """Try lowercase, then Title_case, then original key.""" + return data.get(key.lower(), data.get(key.title(), data.get(key))) + + return cls( + id=_get("id") or "", + status=_get("status") or "unknown", + collector=_get("collector") or "", + inputs=_get("inputs") or 0, + lines=_get("lines") or 0, + fails=_get("fails") or 0, + success_rate=_get("success_rate") or data.get("Success_rate", 0.0), + created=_get("created") or "", + started=_get("started"), + finished=_get("finished"), + job_time=_get("job_time") or data.get("Job_time"), + queue_time=_get("queue_time") or data.get("Queue_time"), + ) + + +class ScraperStudioJob: + """ + A triggered Scraper Studio job. + + Returned by ScraperStudioService.trigger(). Holds the response_id + and provides convenience methods to poll and fetch results. + + Same shape as ScrapeJob but wired to Scraper Studio endpoints. 
+ + Example: + >>> job = await client.scraper_studio.trigger( + ... collector="c_abc123", + ... input={"url": "https://example.com/1"}, + ... ) + >>> data = await job.wait_and_fetch(timeout=120) + """ + + def __init__( + self, + response_id: str, + api_client: "ScraperStudioAPIClient", + ): + self.response_id = response_id + self._api_client = api_client + self._cached_data: Optional[List[Dict[str, Any]]] = None + + def __repr__(self) -> str: + return f"<ScraperStudioJob response_id={self.response_id}>" + + async def fetch(self) -> List[Dict[str, Any]]: + """ + Fetch results via GET /dca/get_result. + + Returns: + List of scraped records. + + Raises: + DataNotReadyError: If data is not ready yet (HTTP 202). + """ + self._cached_data = await self._api_client.fetch_immediate_result(self.response_id) + return self._cached_data + + async def wait_and_fetch( + self, + timeout: int = 300, + poll_interval: int = 10, + ) -> List[Dict[str, Any]]: + """ + Poll fetch() until data arrives or timeout. + + Args: + timeout: Maximum seconds to wait. + poll_interval: Seconds between poll attempts. + + Returns: + List of scraped records. + + Raises: + TimeoutError: If timeout is reached before data is ready. 
+ """ + start_time = time.time() + + while True: + elapsed = time.time() - start_time + + if elapsed > timeout: + raise TimeoutError(f"Job {self.response_id} timed out after {timeout}s") + + try: + return await self.fetch() + except DataNotReadyError: + await asyncio.sleep(poll_interval) diff --git a/src/brightdata/sync_client.py b/src/brightdata/sync_client.py index fb2d6be..4c57e67 100644 --- a/src/brightdata/sync_client.py +++ b/src/brightdata/sync_client.py @@ -8,6 +8,7 @@ from typing import Optional, List, Dict, Any from .client import BrightDataClient +from .api.browser_service import BrowserService from .models import ScrapeResult, SearchResult from .types import AccountInfo @@ -34,7 +35,10 @@ def __init__( timeout: int = 30, web_unlocker_zone: Optional[str] = None, serp_zone: Optional[str] = None, - browser_zone: Optional[str] = None, + browser_username: Optional[str] = None, + browser_password: Optional[str] = None, + browser_host: Optional[str] = None, + browser_port: Optional[int] = None, auto_create_zones: bool = True, validate_token: bool = False, rate_limit: Optional[float] = None, @@ -48,7 +52,10 @@ def __init__( timeout: Default request timeout in seconds web_unlocker_zone: Zone name for Web Unlocker API serp_zone: Zone name for SERP API - browser_zone: Zone name for Browser API + browser_username: Browser API username (or set BRIGHTDATA_BROWSERAPI_USERNAME env var) + browser_password: Browser API password (or set BRIGHTDATA_BROWSERAPI_PASSWORD env var) + browser_host: Browser API host (default: "brd.superproxy.io") + browser_port: Browser API port (default: 9222) auto_create_zones: Automatically create required zones if missing validate_token: Validate token on initialization rate_limit: Rate limit (requests per period) @@ -73,7 +80,10 @@ def __init__( timeout=timeout, web_unlocker_zone=web_unlocker_zone, serp_zone=serp_zone, - browser_zone=browser_zone, + browser_username=browser_username, + browser_password=browser_password, + 
browser_host=browser_host, + browser_port=browser_port, auto_create_zones=auto_create_zones, validate_token=False, # Will validate during __enter__ rate_limit=rate_limit, @@ -84,6 +94,7 @@ def __init__( self._scrape: Optional["SyncScrapeService"] = None self._search: Optional["SyncSearchService"] = None self._crawler: Optional["SyncCrawlerService"] = None + self._scraper_studio: Optional["SyncScraperStudioService"] = None def __enter__(self): """Initialize persistent event loop and async client.""" @@ -174,6 +185,11 @@ def scrape_url(self, url, **kwargs): # Service Properties # ======================================== + @property + def browser(self) -> BrowserService: + """Access Browser API service (builds CDP WebSocket URLs).""" + return self._async_client.browser + @property def scrape(self) -> "SyncScrapeService": """Access scraping services (sync).""" @@ -195,6 +211,15 @@ def crawler(self) -> "SyncCrawlerService": self._crawler = SyncCrawlerService(self._async_client.crawler, self._loop) return self._crawler + @property + def scraper_studio(self) -> "SyncScraperStudioService": + """Access Scraper Studio services (sync).""" + if self._scraper_studio is None: + self._scraper_studio = SyncScraperStudioService( + self._async_client.scraper_studio, self._loop + ) + return self._scraper_studio + @property def token(self) -> str: """Get API token.""" @@ -670,3 +695,34 @@ def crawl(self, url, **kwargs): def scrape(self, url, **kwargs): """Scrape a URL.""" return self._loop.run_until_complete(self._async.scrape(url, **kwargs)) + + +# ============================================================================ +# SYNC SCRAPER STUDIO SERVICE +# ============================================================================ + + +class SyncScraperStudioService: + """Sync wrapper for ScraperStudioService.""" + + def __init__(self, async_service, loop): + self._async = async_service + self._loop = loop + + def run(self, collector, input, timeout=180, poll_interval=10): + 
"""Trigger scrape and wait for results.""" + return self._loop.run_until_complete( + self._async.run(collector, input, timeout=timeout, poll_interval=poll_interval) + ) + + def trigger(self, collector, input): + """Trigger scrape, return job object.""" + return self._loop.run_until_complete(self._async.trigger(collector, input)) + + def status(self, job_id): + """Check job status.""" + return self._loop.run_until_complete(self._async.status(job_id)) + + def fetch(self, response_id): + """Fetch results.""" + return self._loop.run_until_complete(self._async.fetch(response_id)) diff --git a/tests/enes/chatgpt_02.py b/tests/enes/chatgpt_02.py index e7606ca..476b8c3 100644 --- a/tests/enes/chatgpt_02.py +++ b/tests/enes/chatgpt_02.py @@ -218,7 +218,8 @@ async def test_chatgpt(): print("\n" + "=" * 60) print("SUMMARY:") print("-" * 40) - print(""" + print( + """ ChatGPT Scraper Configuration: - Dataset ID: gd_m7aof0k82r803d5bjm - Platform: chatgpt @@ -236,7 +237,8 @@ async def test_chatgpt(): 1. Check API token is valid 2. Verify account has ChatGPT access enabled 3. Check account balance for ChatGPT operations -""") +""" + ) if __name__ == "__main__": diff --git a/tests/enes/serp.py b/tests/enes/serp.py index 2692121..4226e05 100644 --- a/tests/enes/serp.py +++ b/tests/enes/serp.py @@ -107,7 +107,8 @@ def capture_raw(data): print("\n" + "=" * 60) print("SUMMARY:") print("-" * 40) - print(""" + print( + """ The SERP API returns raw HTML but the SDK expects parsed JSON. This is why all SERP searches return 0 results. @@ -115,7 +116,8 @@ def capture_raw(data): 1. The SERP zone needs to return parsed data (not raw HTML) 2. The SDK needs an HTML parser (BeautifulSoup, etc.) 3. 
A different Bright Data service/endpoint should be used -""") +""" + ) if __name__ == "__main__": diff --git a/tests/enes/zones/cache_fix.py b/tests/enes/zones/cache_fix.py index e3dbe91..2ae00ed 100644 --- a/tests/enes/zones/cache_fix.py +++ b/tests/enes/zones/cache_fix.py @@ -66,19 +66,21 @@ async def demo_caching(): print("\n\n" + "=" * 70) print("📝 RECOMMENDATIONS:") print("=" * 70) - print(""" + print( + """ ✅ For listing zones after creation/deletion: Use: await client.list_zones() - + ✅ For general account info (cached): Use: await client.get_account_info() - + ✅ For fresh account info (after zone changes): Use: await client.get_account_info(refresh=True) - + ⚠️ AVOID: Using get_account_info()['zones'] without refresh This returns cached data that may be stale! - """) + """ + ) print("=" * 70) # Show some zones diff --git a/tests/enes/zones/dash_sync.py b/tests/enes/zones/dash_sync.py index 159bef8..9fbaa2b 100644 --- a/tests/enes/zones/dash_sync.py +++ b/tests/enes/zones/dash_sync.py @@ -61,7 +61,8 @@ async def verify_dashboard_sync(): print("\n" + "=" * 70) print("✅ VERIFICATION COMPLETE") print("=" * 70) - print(""" + print( + """ These zones should match exactly what you see in your dashboard at: https://brightdata.com/cp/zones @@ -70,10 +71,11 @@ async def verify_dashboard_sync(): 2. Count the total zones shown 3. Compare with the count above 4. Check that zone names and types match - + ✅ If they match: SDK and dashboard are in sync! ❌ If they don't: There may be a caching or API delay issue - """) + """ + ) return True diff --git a/tests/enes/zones/permission.py b/tests/enes/zones/permission.py index 84d2e04..8046d29 100644 --- a/tests/enes/zones/permission.py +++ b/tests/enes/zones/permission.py @@ -24,7 +24,8 @@ async def test_permission_error_handling(): print("🧪 TESTING PERMISSION ERROR HANDLING") print("=" * 70) - print(""" + print( + """ This test demonstrates the improved error handling when your API token lacks zone creation permissions. 
@@ -33,7 +34,8 @@ async def test_permission_error_handling(): ✅ Direct link to fix the problem ✅ No silent failures ✅ Helpful instructions for users - """) + """ + ) if not os.environ.get("BRIGHTDATA_API_TOKEN"): print("\n❌ ERROR: No API token found") @@ -74,7 +76,8 @@ async def test_permission_error_handling(): print("\n" + "=" * 70) print("📝 This is the IMPROVED error handling!") print("=" * 70) - print(""" + print( + """ Before: Error was unclear and could fail silently After: Clear message with actionable steps to fix the issue @@ -82,7 +85,8 @@ async def test_permission_error_handling(): 1. ❌ What went wrong (permission denied) 2. 🔗 Where to fix it (https://brightdata.com/cp/setting/users) 3. 📋 What to do (enable zone creation permission) - """) + """ + ) return True # This is expected behavior except Exception as e: @@ -101,13 +105,15 @@ async def test_permission_error_handling(): if success: print("✅ TEST PASSED") print("=" * 70) - print(""" + print( + """ Summary: • Permission errors are now caught and displayed clearly • Users get actionable instructions to fix the problem • No more silent failures • SDK provides helpful guidance - """) + """ + ) else: print("❌ TEST FAILED") print("=" * 70) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 6445f6a..771f834 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -18,7 +18,6 @@ def test_client_with_explicit_token(self): assert client.timeout == 30 # Default timeout assert client.web_unlocker_zone == "sdk_unlocker" assert client.serp_zone == "sdk_serp" - assert client.browser_zone == "sdk_browser" def test_client_with_custom_config(self): """Test client with custom configuration.""" @@ -27,13 +26,11 @@ def test_client_with_custom_config(self): timeout=60, web_unlocker_zone="my_unlocker", serp_zone="my_serp", - browser_zone="my_browser", ) assert client.timeout == 60 assert client.web_unlocker_zone == "my_unlocker" assert client.serp_zone == "my_serp" - assert 
client.browser_zone == "my_browser" def test_client_loads_from_brightdata_api_token(self): """Test client loads token from BRIGHTDATA_API_TOKEN.""" diff --git a/tests/unit/test_zone_manager.py b/tests/unit/test_zone_manager.py index 04e48c9..7185d65 100644 --- a/tests/unit/test_zone_manager.py +++ b/tests/unit/test_zone_manager.py @@ -290,7 +290,6 @@ async def test_ensure_zones_with_browser(self, mock_engine): await zone_manager.ensure_required_zones( web_unlocker_zone="sdk_unlocker", serp_zone="sdk_serp", - browser_zone="sdk_browser", # This is passed but NOT created (by design) ) # Should only create unblocker + SERP zones (browser zones require manual setup) From 447c0a028aa47614f9162f9dce3400af9ea3ff0f Mon Sep 17 00:00:00 2001 From: "user.mail" Date: Mon, 9 Mar 2026 05:44:15 +0300 Subject: [PATCH 2/2] v2.3.0: restructure codebase, rewrite test suite, add DigiKey and Reddit scrapers - Reorganize modules from flat api/ into domain directories (browser/, crawler/, serp/, web_unlocker/, scraper_studio/) - Remove dead code: protocols.py, config.py, auto.py, core/auth, core/hooks, core/logging, utils/parsing, utils/timing, _internal/ - Collapse datasets/client.py from 1635 to 285 lines - Fix ScrapeJob.to_result() crash (wrong kwargs to ScrapeResult) - Add DigiKey and Reddit platform scrapers - Rewrite test suite: 365 unit tests with shared fixtures via conftest.py - Delete stale benchmarks, docs, examples, requirements files --- .gitignore | 2 +- CHANGELOG.md | 4 +- MANIFEST.in | 6 - benchmarks/bench_async_vs_sync.py | 1 - benchmarks/bench_batch_operations.py | 1 - benchmarks/bench_memory_usage.py | 1 - docs/api-reference/.gitkeep | 0 docs/async_mode_guide.md | 581 ------ docs/guides/.gitkeep | 0 docs/sync_client.md | 127 -- examples/01_simple_scrape.py | 1 - examples/02_async_scrape.py | 1 - examples/03_batch_scraping.py | 1 - examples/04_specialized_scrapers.py | 1 - examples/05_browser_automation.py | 1 - examples/06_web_crawling.py | 1 - 
examples/07_advanced_usage.py | 1 - examples/08_result_models.py | 168 -- examples/09_result_models_demo.py | 105 - ...s_integration.py => pandas_integration.py} | 0 ...gger_interface.py => trigger_interface.py} | 0 notebooks/test_v2.1.0_release.ipynb | 4 +- pyproject.toml | 3 +- requirements-dev.txt | 9 - requirements.txt | 7 - src/brightdata/__init__.py | 8 +- src/brightdata/_internal/__init__.py | 1 - src/brightdata/_internal/compat.py | 1 - src/brightdata/api/__init__.py | 1 - src/brightdata/api/crawl.py | 1 - src/brightdata/api/datasets.py | 1 - src/brightdata/api/download.py | 1 - src/brightdata/auto.py | 1 - src/brightdata/browser/__init__.py | 5 + .../browser_service.py => browser/service.py} | 0 src/brightdata/client.py | 24 +- src/brightdata/config.py | 1 - src/brightdata/constants.py | 25 - src/brightdata/core/auth.py | 1 - src/brightdata/core/engine.py | 24 +- src/brightdata/core/hooks.py | 1 - src/brightdata/core/logging.py | 1 - src/brightdata/core/zone_manager.py | 38 +- src/brightdata/crawler/__init__.py | 5 + .../crawler_service.py => crawler/service.py} | 0 src/brightdata/datasets/agoda/properties.py | 2 +- src/brightdata/datasets/airbnb/properties.py | 2 +- .../datasets/amazon/best_sellers.py | 2 +- src/brightdata/datasets/amazon/products.py | 2 +- .../datasets/amazon/products_global.py | 2 +- .../datasets/amazon/products_search.py | 2 +- src/brightdata/datasets/amazon/reviews.py | 2 +- src/brightdata/datasets/amazon/sellers.py | 2 +- src/brightdata/datasets/amazon/walmart.py | 2 +- .../datasets/american_eagle/products.py | 2 +- .../datasets/apple_appstore/reviews.py | 2 +- .../datasets/apple_appstore/store.py | 2 +- .../datasets/ashley_furniture/products.py | 2 +- src/brightdata/datasets/asos/products.py | 2 +- src/brightdata/datasets/autozone/products.py | 2 +- .../datasets/balenciaga/products.py | 2 +- src/brightdata/datasets/base.py | 2 +- src/brightdata/datasets/bbc/news.py | 2 +- src/brightdata/datasets/berluti/products.py | 2 +- 
src/brightdata/datasets/bestbuy/products.py | 2 +- src/brightdata/datasets/bh/products.py | 2 +- src/brightdata/datasets/bluesky/posts.py | 2 +- .../datasets/bluesky/top_profiles.py | 2 +- .../datasets/booking/hotel_listings.py | 2 +- .../datasets/booking/listings_search.py | 2 +- .../datasets/bottegaveneta/products.py | 2 +- src/brightdata/datasets/carsales/listings.py | 2 +- src/brightdata/datasets/carters/products.py | 2 +- src/brightdata/datasets/celine/products.py | 2 +- src/brightdata/datasets/chanel/products.py | 2 +- src/brightdata/datasets/chileautos/cars.py | 2 +- src/brightdata/datasets/client.py | 1802 +++-------------- src/brightdata/datasets/client.pyi | 1111 ++++++++++ src/brightdata/datasets/cnn/news.py | 2 +- .../datasets/companies_enriched/companies.py | 2 +- src/brightdata/datasets/costco/products.py | 2 +- .../datasets/crateandbarrel/products.py | 2 +- .../datasets/creative_commons/images.py | 2 +- .../datasets/creative_commons/models_3d.py | 2 +- .../datasets/crunchbase/companies.py | 2 +- src/brightdata/datasets/delvaux/products.py | 2 +- src/brightdata/datasets/digikey/products.py | 2 +- src/brightdata/datasets/dior/products.py | 2 +- src/brightdata/datasets/ebay/products.py | 2 +- .../datasets/employees_enriched/employees.py | 2 +- src/brightdata/datasets/etsy/products.py | 2 +- src/brightdata/datasets/facebook/comments.py | 2 +- .../datasets/facebook/company_reviews.py | 2 +- src/brightdata/datasets/facebook/events.py | 2 +- .../datasets/facebook/group_posts.py | 2 +- .../datasets/facebook/marketplace.py | 2 +- .../datasets/facebook/pages_posts.py | 2 +- .../datasets/facebook/pages_profiles.py | 2 +- .../datasets/facebook/posts_by_url.py | 2 +- src/brightdata/datasets/facebook/profiles.py | 2 +- src/brightdata/datasets/facebook/reels.py | 2 +- src/brightdata/datasets/fanatics/products.py | 2 +- src/brightdata/datasets/fendi/products.py | 2 +- src/brightdata/datasets/g2/products.py | 2 +- src/brightdata/datasets/g2/reviews.py | 2 +- 
.../datasets/github/repositories.py | 2 +- .../datasets/glassdoor/companies.py | 2 +- src/brightdata/datasets/glassdoor/jobs.py | 2 +- src/brightdata/datasets/glassdoor/reviews.py | 2 +- src/brightdata/datasets/goodreads/books.py | 2 +- .../datasets/google_maps/full_info.py | 2 +- .../datasets/google_maps/reviews.py | 2 +- src/brightdata/datasets/google_news/news.py | 2 +- .../datasets/google_play/reviews.py | 2 +- src/brightdata/datasets/google_play/store.py | 2 +- .../datasets/google_shopping/products.py | 2 +- .../datasets/google_shopping/search_us.py | 2 +- src/brightdata/datasets/hermes/products.py | 2 +- src/brightdata/datasets/hm/products.py | 2 +- .../datasets/homedepot/products_ca.py | 2 +- .../datasets/homedepot/products_us.py | 2 +- src/brightdata/datasets/ikea/products.py | 2 +- src/brightdata/datasets/imdb/movies.py | 2 +- src/brightdata/datasets/indeed/companies.py | 2 +- src/brightdata/datasets/indeed/jobs.py | 2 +- .../datasets/infocasas/properties.py | 2 +- .../datasets/inmuebles24/properties.py | 2 +- src/brightdata/datasets/instagram/comments.py | 2 +- src/brightdata/datasets/instagram/posts.py | 2 +- src/brightdata/datasets/instagram/profiles.py | 2 +- src/brightdata/datasets/instagram/reels.py | 2 +- src/brightdata/datasets/kroger/products.py | 2 +- src/brightdata/datasets/lawyers/us_lawyers.py | 2 +- src/brightdata/datasets/lazada/products.py | 2 +- .../datasets/lazada/products_search.py | 2 +- src/brightdata/datasets/lazada/reviews.py | 2 +- src/brightdata/datasets/lazboy/products.py | 2 +- src/brightdata/datasets/lego/products.py | 2 +- .../datasets/linkedin/company_profiles.py | 2 +- .../datasets/linkedin/job_listings.py | 2 +- .../datasets/linkedin/people_profiles.py | 2 +- src/brightdata/datasets/linkedin/posts.py | 2 +- .../linkedin/profiles_job_listings.py | 2 +- src/brightdata/datasets/llbean/products.py | 2 +- src/brightdata/datasets/loewe/products.py | 2 +- src/brightdata/datasets/lowes/products.py | 2 +- 
src/brightdata/datasets/macys/products.py | 2 +- src/brightdata/datasets/mango/products.py | 2 +- src/brightdata/datasets/manta/businesses.py | 2 +- .../datasets/massimo_dutti/products.py | 2 +- .../datasets/mattressfirm/products.py | 2 +- .../datasets/mediamarkt/products.py | 2 +- .../datasets/mercadolivre/products.py | 2 +- .../datasets/metrocuadrado/properties.py | 2 +- .../datasets/microcenter/products.py | 2 +- src/brightdata/datasets/montblanc/products.py | 2 +- src/brightdata/datasets/mouser/products.py | 2 +- src/brightdata/datasets/moynat/products.py | 2 +- src/brightdata/datasets/mybobs/products.py | 2 +- src/brightdata/datasets/myntra/products.py | 2 +- src/brightdata/datasets/naver/products.py | 2 +- src/brightdata/datasets/nba/players_stats.py | 2 +- src/brightdata/datasets/olx/ads.py | 2 +- src/brightdata/datasets/otodom/properties.py | 2 +- src/brightdata/datasets/owler/companies.py | 2 +- src/brightdata/datasets/ozon/products.py | 2 +- src/brightdata/datasets/pinterest/posts.py | 2 +- src/brightdata/datasets/pinterest/profiles.py | 2 +- .../datasets/pitchbook/companies.py | 2 +- src/brightdata/datasets/prada/products.py | 2 +- .../datasets/properati/properties.py | 2 +- src/brightdata/datasets/quora/posts.py | 2 +- .../datasets/raymourflanigan/products.py | 2 +- .../datasets/real_estate/australia.py | 2 +- .../realtor/international_properties.py | 2 +- src/brightdata/datasets/reddit/comments.py | 2 +- src/brightdata/datasets/reddit/posts.py | 2 +- src/brightdata/datasets/rona/products.py | 2 +- src/brightdata/datasets/sephora/products.py | 2 +- src/brightdata/datasets/shein/products.py | 2 +- src/brightdata/datasets/shopee/products.py | 2 +- .../datasets/sleepnumber/products.py | 2 +- src/brightdata/datasets/slintel/companies.py | 2 +- src/brightdata/datasets/snapchat/posts.py | 2 +- src/brightdata/datasets/tiktok/comments.py | 2 +- src/brightdata/datasets/tiktok/posts.py | 2 +- src/brightdata/datasets/tiktok/profiles.py | 2 +- 
src/brightdata/datasets/tiktok/shop.py | 2 +- src/brightdata/datasets/toctoc/properties.py | 2 +- src/brightdata/datasets/tokopedia/products.py | 2 +- src/brightdata/datasets/toysrus/products.py | 2 +- src/brightdata/datasets/trustpilot/reviews.py | 2 +- .../datasets/trustradius/reviews.py | 2 +- .../datasets/ventureradar/companies.py | 2 +- src/brightdata/datasets/vimeo/videos.py | 2 +- src/brightdata/datasets/walmart/products.py | 2 +- src/brightdata/datasets/walmart/sellers.py | 2 +- src/brightdata/datasets/wayfair/products.py | 2 +- src/brightdata/datasets/webmotors/vehicles.py | 2 +- src/brightdata/datasets/wikipedia/articles.py | 2 +- .../datasets/wildberries/products.py | 2 +- .../datasets/world_population/countries.py | 2 +- .../datasets/world_zipcodes/zipcodes.py | 2 +- src/brightdata/datasets/x_twitter/posts.py | 2 +- src/brightdata/datasets/x_twitter/profiles.py | 2 +- src/brightdata/datasets/xing/profiles.py | 2 +- .../datasets/yahoo_finance/businesses.py | 2 +- src/brightdata/datasets/yapo/ads.py | 2 +- src/brightdata/datasets/yelp/businesses.py | 2 +- src/brightdata/datasets/yelp/reviews.py | 2 +- src/brightdata/datasets/youtube/comments.py | 2 +- src/brightdata/datasets/youtube/profiles.py | 2 +- src/brightdata/datasets/youtube/videos.py | 2 +- src/brightdata/datasets/ysl/products.py | 2 +- src/brightdata/datasets/zalando/products.py | 2 +- src/brightdata/datasets/zara/home_products.py | 2 +- src/brightdata/datasets/zara/products.py | 2 +- .../datasets/zillow/price_history.py | 2 +- src/brightdata/datasets/zillow/properties.py | 2 +- .../datasets/zonaprop/properties.py | 2 +- src/brightdata/datasets/zoominfo/companies.py | 2 +- src/brightdata/datasets/zoopla/properties.py | 2 +- src/brightdata/exceptions/__init__.py | 2 - src/brightdata/exceptions/errors.py | 6 - src/brightdata/protocols.py | 1 - src/brightdata/scraper_studio/__init__.py | 3 +- src/brightdata/scraper_studio/client.py | 8 +- .../service.py} | 4 +- src/brightdata/scrapers/__init__.py | 
12 + src/brightdata/scrapers/api_client.py | 8 +- src/brightdata/scrapers/digikey/__init__.py | 3 + src/brightdata/scrapers/digikey/scraper.py | 252 +++ src/brightdata/scrapers/job.py | 12 +- src/brightdata/scrapers/reddit/__init__.py | 3 + src/brightdata/scrapers/reddit/scraper.py | 504 +++++ .../scrape_service.py => scrapers/service.py} | 78 +- src/brightdata/{api => }/serp/__init__.py | 2 + src/brightdata/{api => }/serp/base.py | 18 +- src/brightdata/{api => }/serp/bing.py | 2 +- .../{api => }/serp/data_normalizer.py | 2 +- src/brightdata/{api => }/serp/google.py | 2 +- .../search_service.py => serp/service.py} | 12 +- src/brightdata/{api => }/serp/url_builder.py | 2 +- src/brightdata/{api => }/serp/yandex.py | 2 +- src/brightdata/sync_client.py | 27 +- src/brightdata/types.py | 276 +-- src/brightdata/utils/parsing.py | 1 - src/brightdata/utils/retry.py | 2 +- src/brightdata/utils/timing.py | 1 - src/brightdata/utils/url.py | 17 - src/brightdata/utils/validation.py | 17 - src/brightdata/web_unlocker/__init__.py | 5 + .../async_client.py} | 0 src/brightdata/{api => web_unlocker}/base.py | 0 .../service.py} | 6 +- tests/__init__.py | 1 - tests/conftest.py | 146 +- tests/e2e/__init__.py | 1 - tests/e2e/test_async_operations.py | 1 - tests/e2e/test_batch_scrape.py | 1 - tests/e2e/test_client_e2e.py | 315 --- tests/e2e/test_simple_scrape.py | 1 - tests/enes/amazon.py | 132 -- tests/enes/amazon_search.py | 172 -- tests/enes/chatgpt.py | 190 -- tests/enes/chatgpt_02.py | 245 --- tests/enes/facebook.py | 299 --- tests/enes/get_dataset_metadata.py | 74 - tests/enes/get_datasets.py | 82 - tests/enes/instagram.py | 212 -- tests/enes/linkedin.py | 214 -- tests/enes/serp.py | 124 -- tests/enes/web_unlocker.py | 233 --- tests/enes/zones/auto_zone.py | 218 -- tests/enes/zones/auto_zones.py | 214 -- tests/enes/zones/cache_fix.py | 104 - tests/enes/zones/clean_zones.py | 154 -- tests/enes/zones/crud_zones.py | 300 --- tests/enes/zones/dash_sync.py | 96 - 
tests/enes/zones/delete_zone.py | 154 -- tests/enes/zones/list_zones.py | 261 --- tests/enes/zones/permission.py | 125 -- tests/enes/zones/test_cache.py | 95 - tests/fixtures/.gitkeep | 0 tests/fixtures/mock_data/.gitkeep | 0 tests/fixtures/responses/.gitkeep | 0 tests/integration/__init__.py | 1 - tests/integration/test_browser_api.py | 1 - tests/integration/test_client_integration.py | 220 -- tests/integration/test_crawl_api.py | 1 - tests/integration/test_serp_api.py | 1 - tests/integration/test_serp_async_mode.py | 231 --- tests/integration/test_web_unlocker_api.py | 1 - .../test_web_unlocker_async_mode.py | 254 --- tests/readme.py | 1044 ---------- tests/run_all.py | 185 -- tests/samples/amazon/product.json | 648 ------ tests/samples/amazon/reviews.json | 137 -- tests/samples/chatgpt/prompt.json | 35 - tests/samples/facebook/posts.json | 537 ----- tests/samples/instagram/profile.json | 228 --- tests/samples/linkedin/profile.json | 407 ---- tests/samples/serp/google.json | 23 - .../web_unlocker/country_targeting.html | 17 - .../samples/web_unlocker/multiple_urls_1.html | 14 - .../samples/web_unlocker/multiple_urls_2.html | 24 - .../samples/web_unlocker/multiple_urls_3.html | 1 - .../samples/web_unlocker/single_url_json.json | 13 - .../samples/web_unlocker/single_url_raw.html | 14 - tests/test_cli.sh | 175 -- tests/unit/__init__.py | 1 - tests/unit/test_amazon.py | 314 --- tests/unit/test_async_unblocker.py | 176 +- tests/unit/test_batch.py | 172 -- tests/unit/test_chatgpt.py | 265 --- tests/unit/test_client.py | 555 +++-- tests/unit/test_constants.py | 274 --- tests/unit/test_engine.py | 315 ++- tests/unit/test_engine_sharing.py | 217 -- tests/unit/test_facebook.py | 262 --- tests/unit/test_function_detection.py | 251 --- tests/unit/test_instagram.py | 390 ---- tests/unit/test_linkedin.py | 535 ----- tests/unit/test_models.py | 241 +-- tests/unit/test_payloads.py | 211 +- tests/unit/test_retry.py | 181 +- tests/unit/test_scrapers.py | 476 ----- 
tests/unit/test_serp.py | 507 ----- tests/unit/test_ssl_helpers.py | 238 +-- tests/unit/test_validation.py | 1 - tests/unit/test_zone_manager.py | 358 ++-- 331 files changed, 3956 insertions(+), 16079 deletions(-) delete mode 100644 MANIFEST.in delete mode 100644 benchmarks/bench_async_vs_sync.py delete mode 100644 benchmarks/bench_batch_operations.py delete mode 100644 benchmarks/bench_memory_usage.py delete mode 100644 docs/api-reference/.gitkeep delete mode 100644 docs/async_mode_guide.md delete mode 100644 docs/guides/.gitkeep delete mode 100644 docs/sync_client.md delete mode 100644 examples/01_simple_scrape.py delete mode 100644 examples/02_async_scrape.py delete mode 100644 examples/03_batch_scraping.py delete mode 100644 examples/04_specialized_scrapers.py delete mode 100644 examples/05_browser_automation.py delete mode 100644 examples/06_web_crawling.py delete mode 100644 examples/07_advanced_usage.py delete mode 100644 examples/08_result_models.py delete mode 100644 examples/09_result_models_demo.py rename examples/{10_pandas_integration.py => pandas_integration.py} (100%) rename examples/{11_trigger_interface.py => trigger_interface.py} (100%) delete mode 100644 requirements-dev.txt delete mode 100644 requirements.txt delete mode 100644 src/brightdata/_internal/__init__.py delete mode 100644 src/brightdata/_internal/compat.py delete mode 100644 src/brightdata/api/__init__.py delete mode 100644 src/brightdata/api/crawl.py delete mode 100644 src/brightdata/api/datasets.py delete mode 100644 src/brightdata/api/download.py delete mode 100644 src/brightdata/auto.py create mode 100644 src/brightdata/browser/__init__.py rename src/brightdata/{api/browser_service.py => browser/service.py} (100%) delete mode 100644 src/brightdata/config.py delete mode 100644 src/brightdata/core/auth.py delete mode 100644 src/brightdata/core/hooks.py delete mode 100644 src/brightdata/core/logging.py create mode 100644 src/brightdata/crawler/__init__.py rename 
src/brightdata/{api/crawler_service.py => crawler/service.py} (100%) create mode 100644 src/brightdata/datasets/client.pyi delete mode 100644 src/brightdata/protocols.py rename src/brightdata/{api/scraper_studio_service.py => scraper_studio/service.py} (97%) create mode 100644 src/brightdata/scrapers/digikey/__init__.py create mode 100644 src/brightdata/scrapers/digikey/scraper.py create mode 100644 src/brightdata/scrapers/reddit/__init__.py create mode 100644 src/brightdata/scrapers/reddit/scraper.py rename src/brightdata/{api/scrape_service.py => scrapers/service.py} (78%) rename src/brightdata/{api => }/serp/__init__.py (83%) rename src/brightdata/{api => }/serp/base.py (98%) rename src/brightdata/{api => }/serp/bing.py (95%) rename src/brightdata/{api => }/serp/data_normalizer.py (99%) rename src/brightdata/{api => }/serp/google.py (96%) rename src/brightdata/{api/search_service.py => serp/service.py} (97%) rename src/brightdata/{api => }/serp/url_builder.py (98%) rename src/brightdata/{api => }/serp/yandex.py (95%) delete mode 100644 src/brightdata/utils/parsing.py delete mode 100644 src/brightdata/utils/timing.py create mode 100644 src/brightdata/web_unlocker/__init__.py rename src/brightdata/{api/async_unblocker.py => web_unlocker/async_client.py} (100%) rename src/brightdata/{api => web_unlocker}/base.py (100%) rename src/brightdata/{api/web_unlocker.py => web_unlocker/service.py} (99%) delete mode 100644 tests/e2e/__init__.py delete mode 100644 tests/e2e/test_async_operations.py delete mode 100644 tests/e2e/test_batch_scrape.py delete mode 100644 tests/e2e/test_client_e2e.py delete mode 100644 tests/e2e/test_simple_scrape.py delete mode 100644 tests/enes/amazon.py delete mode 100644 tests/enes/amazon_search.py delete mode 100644 tests/enes/chatgpt.py delete mode 100644 tests/enes/chatgpt_02.py delete mode 100644 tests/enes/facebook.py delete mode 100644 tests/enes/get_dataset_metadata.py delete mode 100644 tests/enes/get_datasets.py delete mode 100644 
tests/enes/instagram.py delete mode 100644 tests/enes/linkedin.py delete mode 100644 tests/enes/serp.py delete mode 100644 tests/enes/web_unlocker.py delete mode 100644 tests/enes/zones/auto_zone.py delete mode 100644 tests/enes/zones/auto_zones.py delete mode 100644 tests/enes/zones/cache_fix.py delete mode 100644 tests/enes/zones/clean_zones.py delete mode 100644 tests/enes/zones/crud_zones.py delete mode 100644 tests/enes/zones/dash_sync.py delete mode 100644 tests/enes/zones/delete_zone.py delete mode 100644 tests/enes/zones/list_zones.py delete mode 100644 tests/enes/zones/permission.py delete mode 100644 tests/enes/zones/test_cache.py delete mode 100644 tests/fixtures/.gitkeep delete mode 100644 tests/fixtures/mock_data/.gitkeep delete mode 100644 tests/fixtures/responses/.gitkeep delete mode 100644 tests/integration/test_browser_api.py delete mode 100644 tests/integration/test_client_integration.py delete mode 100644 tests/integration/test_crawl_api.py delete mode 100644 tests/integration/test_serp_api.py delete mode 100644 tests/integration/test_serp_async_mode.py delete mode 100644 tests/integration/test_web_unlocker_api.py delete mode 100644 tests/integration/test_web_unlocker_async_mode.py delete mode 100644 tests/readme.py delete mode 100644 tests/run_all.py delete mode 100644 tests/samples/amazon/product.json delete mode 100644 tests/samples/amazon/reviews.json delete mode 100644 tests/samples/chatgpt/prompt.json delete mode 100644 tests/samples/facebook/posts.json delete mode 100644 tests/samples/instagram/profile.json delete mode 100644 tests/samples/linkedin/profile.json delete mode 100644 tests/samples/serp/google.json delete mode 100644 tests/samples/web_unlocker/country_targeting.html delete mode 100644 tests/samples/web_unlocker/multiple_urls_1.html delete mode 100644 tests/samples/web_unlocker/multiple_urls_2.html delete mode 100644 tests/samples/web_unlocker/multiple_urls_3.html delete mode 100644 
tests/samples/web_unlocker/single_url_json.json delete mode 100644 tests/samples/web_unlocker/single_url_raw.html delete mode 100755 tests/test_cli.sh delete mode 100644 tests/unit/test_amazon.py delete mode 100644 tests/unit/test_batch.py delete mode 100644 tests/unit/test_chatgpt.py delete mode 100644 tests/unit/test_constants.py delete mode 100644 tests/unit/test_engine_sharing.py delete mode 100644 tests/unit/test_facebook.py delete mode 100644 tests/unit/test_function_detection.py delete mode 100644 tests/unit/test_instagram.py delete mode 100644 tests/unit/test_linkedin.py delete mode 100644 tests/unit/test_scrapers.py delete mode 100644 tests/unit/test_serp.py delete mode 100644 tests/unit/test_validation.py diff --git a/.gitignore b/.gitignore index d255b80..1b4d081 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,7 @@ __pycache__/ *.so -.devodcs +.devdocs # Distribution / packaging .Python diff --git a/CHANGELOG.md b/CHANGELOG.md index e7b4f34..54007f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,12 @@ # Bright Data Python SDK Changelog -## Version 2.2.2 - Browser API, Scraper Studio, 175 Datasets +## Version 2.3.0 - Browser API, Scraper Studio, 175 Datasets - **Browser API**: Connect to cloud Chrome via CDP WebSocket. 
SDK builds the `wss://` URL, you connect with Playwright/Puppeteer (`client.browser.get_connect_url()`) - **Scraper Studio**: Trigger and fetch results from custom scrapers built in Bright Data's IDE (`client.scraper_studio.run()`) - **75 more datasets**: Agoda, AutoZone, BBC, Best Buy, Bluesky, Booking, Costco, eBay, Etsy, GitHub, Google News/Play/Shopping, Home Depot, Kroger, Lowe's, Macy's, Microcenter, Ozon, Quora, Realtor, Reddit, Snapchat, TikTok Shop, Tokopedia, Vimeo, Wayfair, Wikipedia, Wildberries, X/Twitter, Yahoo Finance, Zoopla, and more — **175 total** +- **Codebase cleanup**: Removed dead code and legacy abstractions — collapsed `datasets/client.py` from 1635 to 285 lines, fixed `ScrapeJob.to_result()` crash bug, cleaned up unused protocols, redundant config layers, and stale API modules +- **Test suite rewrite**: Rebuilt test suite from scratch with 365 unit tests, shared fixtures via `conftest.py`, behavioral coverage focus — key modules now at 87–98% coverage (client, scrapers, SERP, sync client, job lifecycle) --- diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 63958da..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,6 +0,0 @@ -include LICENSE -include README.md -include CHANGELOG.md -include pyproject.toml -recursive-include src *.py -recursive-include src *.typed diff --git a/benchmarks/bench_async_vs_sync.py b/benchmarks/bench_async_vs_sync.py deleted file mode 100644 index b0f2e58..0000000 --- a/benchmarks/bench_async_vs_sync.py +++ /dev/null @@ -1 +0,0 @@ -"""Benchmark: Async vs Sync performance.""" diff --git a/benchmarks/bench_batch_operations.py b/benchmarks/bench_batch_operations.py deleted file mode 100644 index 8350ccd..0000000 --- a/benchmarks/bench_batch_operations.py +++ /dev/null @@ -1 +0,0 @@ -"""Benchmark: Batch operations performance.""" diff --git a/benchmarks/bench_memory_usage.py b/benchmarks/bench_memory_usage.py deleted file mode 100644 index d34c80a..0000000 --- a/benchmarks/bench_memory_usage.py +++ 
/dev/null @@ -1 +0,0 @@ -"""Benchmark: Memory usage.""" diff --git a/docs/api-reference/.gitkeep b/docs/api-reference/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/docs/async_mode_guide.md b/docs/async_mode_guide.md deleted file mode 100644 index c77d8d3..0000000 --- a/docs/async_mode_guide.md +++ /dev/null @@ -1,581 +0,0 @@ -# Async Mode Guide - -## Overview - -Async mode allows non-blocking requests for both SERP and Web Unlocker APIs using Bright Data's unblocker endpoints. This enables batch operations, background processing, and better resource utilization. - -This guide covers: -- **SERP Async Mode**: Non-blocking search engine scraping -- **Web Unlocker Async Mode**: Non-blocking web page scraping - -## Sync vs Async Comparison - -| Feature | Sync Mode (Default) | Async Mode | -|---------|-------------------|------------| -| Endpoint | `/request` | `/unblocker/req` + `/unblocker/get_result` | -| Behavior | Blocks until ready | Returns immediately, polls for results | -| Use Case | Simple queries | Batch operations, background tasks | -| Response | Normalized SERP data | Same normalized SERP data | -| Configuration | None (default) | `mode="async"` | -| customer_id | Not required | Not required | - -## Key Benefits - -1. **Non-Blocking**: Continue working while scraping happens in background -2. **Batch Processing**: Trigger multiple searches, collect results later -3. **Same Data Structure**: Both modes return identical normalized data -4. 
**No Extra Setup**: Works with existing zones and authentication - -## Basic Usage - -### Default (Sync Mode) - -This is the existing behavior - backwards compatible: - -```python -from brightdata import BrightDataClient - -async with BrightDataClient() as client: - result = await client.search.google( - query="test", - zone="my_serp_zone" - ) - # Blocks until results ready, then returns - print(result.data) -``` - -### Async Mode - -Simply add `mode="async"`: - -```python -from brightdata import BrightDataClient - -async with BrightDataClient() as client: - result = await client.search.google( - query="test", - zone="my_serp_zone", - mode="async", # ← Enable async mode - poll_interval=2, # Check every 2 seconds - poll_timeout=30 # Give up after 30 seconds - ) - # Triggers request, polls until ready or timeout - print(result.data) -``` - -## Advanced Usage - -### Batch Operations - -Process multiple queries efficiently: - -```python -async with BrightDataClient() as client: - queries = ["python", "javascript", "golang"] - - # All queries triggered concurrently, each polled independently - results = await client.search.google( - query=queries, - zone="my_zone", - mode="async", - poll_interval=2, - poll_timeout=60 # Longer timeout for batch - ) - - for result in results: - if result.success: - print(f"Query: {result.query['q']}") - print(f"Results: {len(result.data)}") - else: - print(f"Error: {result.error}") -``` - -### With Location Parameters - -Async mode supports all the same parameters as sync: - -```python -result = await client.search.google( - query="restaurants", - zone="my_zone", - location="US", - language="en", - device="desktop", - num_results=20, - mode="async", - poll_interval=2, - poll_timeout=30 -) -``` - -### Handling Timeouts - -```python -result = await client.search.google( - query="complex query", - zone="my_zone", - mode="async", - poll_timeout=10 # Short timeout -) - -if not result.success: - if "timeout" in result.error.lower(): - 
print("Search timed out - try increasing poll_timeout") - else: - print(f"Error: {result.error}") -``` - -## Configuration - -### No Extra Setup Required! - -Unlike other async implementations, Bright Data's async unblocker: -- ✅ Doesn't require customer_id (derived from API token) -- ✅ Works with the same zones as sync mode -- ✅ Returns the same data structure -- ✅ Uses the same authentication - -Just add `mode="async"` to any existing SERP call. - -### Polling Parameters - -Fine-tune polling behavior: - -```python -result = await client.search.google( - query="test", - zone="my_zone", - mode="async", - poll_interval=5, # Wait 5 seconds between checks (default: 2) - poll_timeout=120 # Give up after 2 minutes (default: 30) -) -``` - -**Recommendations:** -- `poll_interval`: 2-5 seconds (balance between responsiveness and API load) -- `poll_timeout`: 30-60 seconds for single queries, 60-120 for batches - -## Performance - -### Trigger Time - -- **Sync mode**: Blocks for entire scrape (~2-5 seconds) -- **Async mode**: Returns after trigger (~0.5-1 second) - -### Total Time - -Total time is similar for both modes - the difference is whether you **block** or **poll**: - -``` -Sync: [====== WAIT ======] → Results -Async: [Trigger] ... [Poll] [Poll] [Poll] → Results - ↑ - Do other work here! 
-``` - -### Batch Efficiency - -Async mode shines for batches: - -```python -# Sync mode: Sequential (~15 seconds for 5 queries) -for query in queries: - result = await search(query, mode="sync") # 3s each - -# Async mode: Concurrent (~3-5 seconds for 5 queries) -results = await search(queries, mode="async") # All triggered at once -``` - -## Error Handling - -Async mode returns the same `SearchResult` structure with error handling: - -```python -result = await client.search.google( - query="test", - zone="my_zone", - mode="async", - poll_timeout=10 -) - -if result.success: - print(f"Got {len(result.data)} results") -else: - print(f"Error: {result.error}") - # Common errors: - # - "Polling timeout after 10s (response_id: ...)" - # - "Async request failed (response_id: ...)" - # - "Failed to trigger async request (no response_id received)" -``` - -## Migration Guide - -### From Sync to Async - -**Before (Sync):** -```python -result = await client.search.google(query="test", zone="my_zone") -``` - -**After (Async):** -```python -result = await client.search.google( - query="test", - zone="my_zone", - mode="async", - poll_interval=2, - poll_timeout=30 -) -``` - -### No Breaking Changes - -Existing code continues to work without modification: - -```python -# This still works exactly as before (defaults to sync mode) -result = await client.search.google(query="test", zone="my_zone") -``` - -## Supported Search Engines - -Async mode works with all SERP endpoints: - -- ✅ Google: `client.search.google()` -- ✅ Bing: `client.search.bing()` -- ✅ Yandex: `client.search.yandex()` - -All support the same `mode="async"` parameter. - -## Technical Details - -### How It Works - -1. **Trigger**: POST to `/unblocker/req?zone=X` with search URL -2. **Response ID**: Receive `x-response-id` header -3. **Poll**: GET `/unblocker/get_result?zone=X&response_id=Y` - - HTTP 202: Still pending, wait and retry - - HTTP 200: Results ready, fetch data - - Other: Error occurred -4. 
**Results**: Parse and normalize SERP data - -### Response Structure - -Both sync and async return the same normalized structure: - -```python -{ - "general": { - "search_engine": "google", - "query": "python programming", - "language": "en-US" - }, - "organic": [ - { - "rank": 1, - "title": "Welcome to Python.org", - "link": "https://www.python.org/", - "description": "..." - } - ], - "top_ads": [...], - "knowledge": {...} -} -``` - -## Best Practices - -1. **Use async for batches**: If processing >3 queries, async mode is more efficient -2. **Set reasonable timeouts**: Give enough time but don't wait forever -3. **Handle errors gracefully**: Check `result.success` before accessing data -4. **Monitor poll_interval**: Don't poll too aggressively (2-5s is good) -5. **Stick with sync for one-offs**: For single, simple queries, sync is simpler - -## Troubleshooting - -### "Polling timeout after 30s" - -**Cause**: Search took longer than `poll_timeout` - -**Solution**: Increase `poll_timeout` or check if query is too complex - -### "Failed to trigger async request" - -**Cause**: Trigger endpoint didn't return response_id - -**Solution**: Check zone configuration, API token validity - -### "Response not ready yet (HTTP 202)" - -**Cause**: Called fetch before results ready (shouldn't happen with polling) - -**Solution**: This is handled internally - if you see this, it's a bug - -## FAQ - -**Q: Do I need customer_id for async mode?** - -A: No! Unlike other implementations, Bright Data derives customer from your API token. - -**Q: Will async mode cost more?** - -A: No, costs are the same for both modes. - -**Q: Can I use async mode with custom zones?** - -A: Yes, async mode works with any zone that supports SERP. - -**Q: What's the difference between this and asyncio?** - -A: This is about Bright Data's API behavior (blocking vs polling), not Python's async/await. The SDK is already asyncio-based. - -**Q: Can I mix sync and async in the same code?** - -A: Yes! 
Choose mode per request: - -```python -result1 = await search(query1, mode="sync") # Blocking -result2 = await search(query2, mode="async") # Non-blocking -``` - ---- - -# Web Unlocker Async Mode - -## Overview - -Web Unlocker also supports async mode using the same unblocker endpoints. This enables non-blocking HTML scraping for better batch processing and resource utilization. - -## Sync vs Async for Web Unlocker - -| Feature | Sync Mode (Default) | Async Mode | -|---------|-------------------|------------| -| Endpoint | `/request` | `/unblocker/req` + `/unblocker/get_result` | -| Behavior | Blocks until ready | Returns immediately, polls for results | -| Use Case | Single page scrapes | Batch scraping, background tasks | -| Response | HTML/JSON | Same HTML/JSON | -| Configuration | None (default) | `mode="async"` | - -## Basic Usage - -### Default (Sync Mode) - -Existing behavior - backwards compatible: - -```python -from brightdata import BrightDataClient - -async with BrightDataClient() as client: - result = await client.scrape_url( - url="https://example.com", - zone="my_web_unlocker_zone" - ) - # Blocks until scraping complete - print(result.data) # HTML content -``` - -### Async Mode - -Simply add `mode="async"`: - -```python -from brightdata import BrightDataClient - -async with BrightDataClient() as client: - result = await client.scrape_url( - url="https://example.com", - zone="my_web_unlocker_zone", - mode="async", # ← Enable async mode - poll_interval=2, # Check every 2 seconds - poll_timeout=30 # Give up after 30 seconds - ) - # Triggers request, polls until ready or timeout - print(result.data) # HTML content -``` - -## Advanced Usage - -### Batch URL Scraping - -Process multiple URLs efficiently: - -```python -async with BrightDataClient() as client: - urls = [ - "https://example.com", - "https://example.org", - "https://example.net" - ] - - # All URLs triggered concurrently, each polled independently - results = await client.scrape_url( - url=urls, 
- zone="my_zone", - mode="async", - poll_interval=2, - poll_timeout=60 # Longer timeout for batch - ) - - for i, result in enumerate(results): - if result.success: - print(f"URL {i+1}: {len(result.data)} bytes") - else: - print(f"URL {i+1} failed: {result.error}") -``` - -### With Country and Response Format - -Async mode supports all the same parameters as sync: - -```python -result = await client.scrape_url( - url="https://api.example.com/data", - zone="my_zone", - country="US", - response_format="json", # Get JSON instead of raw HTML - mode="async", - poll_interval=2, - poll_timeout=30 -) - -if result.success: - print(result.data) # Parsed JSON dict -``` - -### Handling Timeouts - -```python -result = await client.scrape_url( - url="https://slow-site.example.com", - zone="my_zone", - mode="async", - poll_timeout=10 # Short timeout -) - -if not result.success: - if "timeout" in result.error.lower(): - print("Scraping timed out - try increasing poll_timeout") - else: - print(f"Error: {result.error}") -``` - -## Performance Characteristics - -### Trigger Time - -- **Sync mode**: Blocks for entire scrape (~2-10 seconds depending on page) -- **Async mode**: Returns after trigger (~0.5-1 second) - -### Total Time - -Similar to SERP, total time is comparable - the difference is **blocking** vs **polling**: - -``` -Sync: [====== WAIT ======] → HTML -Async: [Trigger] ... [Poll] [Poll] [Poll] → HTML - ↑ - Do other work here! 
-``` - -### Batch Efficiency - -Async mode excels for batch scraping: - -```python -# Sync mode: Sequential (~30 seconds for 5 URLs) -for url in urls: - result = await scrape_url(url, mode="sync") # 6s each - -# Async mode: Concurrent (~6-8 seconds for 5 URLs) -results = await scrape_url(urls, mode="async") # All triggered at once -``` - -## Error Handling - -Async mode returns the same `ScrapeResult` structure: - -```python -result = await client.scrape_url( - url="https://example.com", - zone="my_zone", - mode="async", - poll_timeout=10 -) - -if result.success: - print(f"Scraped {len(result.data)} bytes") - print(f"Root domain: {result.root_domain}") - print(f"Method: {result.method}") # "web_unlocker" -else: - print(f"Error: {result.error}") - # Common errors: - # - "Polling timeout after 10s (response_id: ...)" - # - "Async request failed (response_id: ...)" - # - "Failed to trigger async request: ..." -``` - -## Migration from Sync to Async - -**Before (Sync):** -```python -result = await client.scrape_url( - url="https://example.com", - zone="my_zone" -) -``` - -**After (Async):** -```python -result = await client.scrape_url( - url="https://example.com", - zone="my_zone", - mode="async", - poll_interval=2, - poll_timeout=30 -) -``` - -**No Breaking Changes**: Existing code continues to work (defaults to sync mode). - -## Best Practices - -1. **Use async for batches**: If scraping >3 URLs, async mode is more efficient -2. **Set reasonable timeouts**: Web scraping can be slower than SERP (30-60s recommended) -3. **Handle errors gracefully**: Always check `result.success` before accessing data -4. **Monitor poll_interval**: 2-5 seconds is optimal (don't poll too aggressively) -5. 
**Use sync for single pages**: For one-off scrapes, sync is simpler - -## Combining SERP and Web Unlocker Async - -You can mix both in the same workflow: - -```python -async with BrightDataClient() as client: - # Async search for URLs - search_result = await client.search.google( - query="python tutorials", - zone=client.serp_zone, - mode="async" - ) - - # Extract URLs from search results - urls = [r["link"] for r in search_result.data[:5]] - - # Batch scrape those URLs - scrape_results = await client.scrape_url( - url=urls, - zone=client.web_unlocker_zone, - mode="async", - poll_timeout=60 - ) - - for result in scrape_results: - if result.success: - print(f"Scraped: {result.url} ({len(result.data)} bytes)") -``` - -## See Also - -- [Main README](../README.md) - General SDK usage -- [SERP API Endpoints](../devdocs/serp_info.md) - Technical details about endpoints -- [Implementation Plan](../devdocs/enhancements/plan.md) - How async mode was built diff --git a/docs/guides/.gitkeep b/docs/guides/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/docs/sync_client.md b/docs/sync_client.md deleted file mode 100644 index 372f8af..0000000 --- a/docs/sync_client.md +++ /dev/null @@ -1,127 +0,0 @@ -# Sync Client - -`SyncBrightDataClient` provides a synchronous interface for the Bright Data SDK. Use it when you don't need async/await or for simpler scripts. 
- -## Basic Usage - -```python -from brightdata import SyncBrightDataClient - -with SyncBrightDataClient() as client: - result = client.scrape_url("https://example.com") - print(result.data) -``` - -## How It Works - -- Wraps the async `BrightDataClient` with a persistent event loop -- All methods have the same signature as the async client (without `await`) -- Uses `run_until_complete()` internally for better performance than repeated `asyncio.run()` calls - -## Available Methods - -### Client Methods - -```python -client.scrape_url(url, **kwargs) # Scrape any URL -client.test_connection() # Test API connection -client.get_account_info() # Get account info -client.list_zones() # List all zones -client.delete_zone(zone_name) # Delete a zone -``` - -### Scrape Service - -```python -# Amazon -client.scrape.amazon.products(url) -client.scrape.amazon.products_trigger(url) -client.scrape.amazon.products_status(snapshot_id) -client.scrape.amazon.products_fetch(snapshot_id) -client.scrape.amazon.reviews(url) -client.scrape.amazon.sellers(url) - -# LinkedIn -client.scrape.linkedin.profiles(url) -client.scrape.linkedin.companies(url) -client.scrape.linkedin.jobs(url) -client.scrape.linkedin.posts(url) - -# Instagram -client.scrape.instagram.profiles(url) -client.scrape.instagram.posts(url) -client.scrape.instagram.comments(url) -client.scrape.instagram.reels(url) - -# Facebook -client.scrape.facebook.posts_by_profile(url) -client.scrape.facebook.posts_by_group(url) -client.scrape.facebook.comments(url) -client.scrape.facebook.reels(url) - -# ChatGPT -client.scrape.chatgpt.prompt(prompt) -client.scrape.chatgpt.prompts(prompts) -``` - -### Search Service - -```python -client.search.google(query) -client.search.bing(query) -client.search.yandex(query) -client.search.amazon.products(keyword) -client.search.linkedin.jobs(keyword) -client.search.linkedin.profiles(**kwargs) -``` - -### Crawler Service - -```python -client.crawler.crawl(url) -client.crawler.scrape(url) -``` - -## 
Important Notes - -### Not Thread-Safe - -`SyncBrightDataClient` is **not thread-safe**. For multi-threaded usage, create a separate client per thread: - -```python -import threading - -def worker(): - with SyncBrightDataClient() as client: - result = client.scrape_url("https://example.com") - -threads = [threading.Thread(target=worker) for _ in range(3)] -for t in threads: - t.start() -``` - -### Cannot Use Inside Async Context - -Using `SyncBrightDataClient` inside an async function will raise an error: - -```python -# Wrong - will raise RuntimeError -async def main(): - with SyncBrightDataClient() as client: # Error! - ... - -# Correct - use async client -async def main(): - async with BrightDataClient() as client: - result = await client.scrape_url("...") -``` - -## When to Use Sync vs Async - -| Use Case | Recommended | -|----------|-------------| -| Simple scripts | `SyncBrightDataClient` | -| Jupyter notebooks | `SyncBrightDataClient` | -| Web frameworks (FastAPI, etc.) | `BrightDataClient` (async) | -| High-volume scraping | `BrightDataClient` (async) | -| Concurrent requests | `BrightDataClient` (async) | diff --git a/examples/01_simple_scrape.py b/examples/01_simple_scrape.py deleted file mode 100644 index 22791d1..0000000 --- a/examples/01_simple_scrape.py +++ /dev/null @@ -1 +0,0 @@ -"""Example: Simple scraping.""" diff --git a/examples/02_async_scrape.py b/examples/02_async_scrape.py deleted file mode 100644 index 92f8d0e..0000000 --- a/examples/02_async_scrape.py +++ /dev/null @@ -1 +0,0 @@ -"""Example: Async scraping.""" diff --git a/examples/03_batch_scraping.py b/examples/03_batch_scraping.py deleted file mode 100644 index 3fd8c2a..0000000 --- a/examples/03_batch_scraping.py +++ /dev/null @@ -1 +0,0 @@ -"""Example: Batch scraping.""" diff --git a/examples/04_specialized_scrapers.py b/examples/04_specialized_scrapers.py deleted file mode 100644 index 4917fd7..0000000 --- a/examples/04_specialized_scrapers.py +++ /dev/null @@ -1 +0,0 @@ -"""Example: 
Specialized scrapers.""" diff --git a/examples/05_browser_automation.py b/examples/05_browser_automation.py deleted file mode 100644 index 7a4c42c..0000000 --- a/examples/05_browser_automation.py +++ /dev/null @@ -1 +0,0 @@ -"""Example: Browser automation.""" diff --git a/examples/06_web_crawling.py b/examples/06_web_crawling.py deleted file mode 100644 index e4d29be..0000000 --- a/examples/06_web_crawling.py +++ /dev/null @@ -1 +0,0 @@ -"""Example: Web crawling.""" diff --git a/examples/07_advanced_usage.py b/examples/07_advanced_usage.py deleted file mode 100644 index e30842a..0000000 --- a/examples/07_advanced_usage.py +++ /dev/null @@ -1 +0,0 @@ -"""Example: Advanced usage.""" diff --git a/examples/08_result_models.py b/examples/08_result_models.py deleted file mode 100644 index 3528fd1..0000000 --- a/examples/08_result_models.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Example: Using unified result models.""" - -from datetime import datetime -from brightdata.models import ScrapeResult, SearchResult, CrawlResult - - -def example_scrape_result(): - """Example of using ScrapeResult.""" - print("=== ScrapeResult Example ===\n") - - # Create a scrape result - result = ScrapeResult( - success=True, - url="https://www.amazon.com/dp/B0CRMZHDG8", - platform="amazon", - cost=0.001, - snapshot_id="snapshot_12345", - data={"product": "Example Product", "price": "$29.99"}, - trigger_sent_at=datetime.utcnow(), - data_fetched_at=datetime.utcnow(), - root_domain="amazon.com", - row_count=1, - ) - - print(f"Result: {result}") - print(f"Success: {result.success}") - print(f"URL: {result.url}") - print(f"Platform: {result.platform}") - print(f"Cost: ${result.cost:.4f}") - print(f"Elapsed: {result.elapsed_ms():.2f} ms") - print("\nTiming Breakdown:") - for key, value in result.get_timing_breakdown().items(): - print(f" {key}: {value}") - - # Serialize to JSON - print("\nJSON representation:") - print(result.to_json(indent=2)) - - # Save to file - 
result.save_to_file("scrape_result.json", format="json") - print("\nSaved to scrape_result.json") - - -def example_search_result(): - """Example of using SearchResult.""" - print("\n\n=== SearchResult Example ===\n") - - result = SearchResult( - success=True, - query={"q": "python async", "engine": "google", "country": "us"}, - search_engine="google", - country="us", - total_found=1000000, - page=1, - results_per_page=10, - data=[ - {"title": "Python AsyncIO", "url": "https://example.com/1"}, - {"title": "Async Python Guide", "url": "https://example.com/2"}, - ], - cost=0.002, - trigger_sent_at=datetime.utcnow(), - data_fetched_at=datetime.utcnow(), - ) - - print(f"Result: {result}") - print(f"Query: {result.query}") - print(f"Total Found: {result.total_found:,}") - print(f"Results: {len(result.data) if result.data else 0} items") - print(f"Cost: ${result.cost:.4f}") - - # Get timing breakdown - print("\nTiming Breakdown:") - for key, value in result.get_timing_breakdown().items(): - print(f" {key}: {value}") - - -def example_crawl_result(): - """Example of using CrawlResult.""" - print("\n\n=== CrawlResult Example ===\n") - - result = CrawlResult( - success=True, - domain="example.com", - start_url="https://example.com", - total_pages=5, - depth=2, - pages=[ - {"url": "https://example.com/page1", "status": 200, "data": {}}, - {"url": "https://example.com/page2", "status": 200, "data": {}}, - ], - cost=0.005, - crawl_started_at=datetime.utcnow(), - crawl_completed_at=datetime.utcnow(), - ) - - print(f"Result: {result}") - print(f"Domain: {result.domain}") - print(f"Total Pages: {result.total_pages}") - print(f"Depth: {result.depth}") - print(f"Pages Crawled: {len(result.pages)}") - print(f"Cost: ${result.cost:.4f}") - - # Get timing breakdown - print("\nTiming Breakdown:") - for key, value in result.get_timing_breakdown().items(): - print(f" {key}: {value}") - - -def example_error_handling(): - """Example of error handling with result models.""" - print("\n\n=== 
Error Handling Example ===\n") - - # Failed scrape - error_result = ScrapeResult( - success=False, - url="https://example.com/failed", - status="error", - error="Connection timeout after 30 seconds", - cost=0.0, # No charge for failed requests - trigger_sent_at=datetime.utcnow(), - data_fetched_at=datetime.utcnow(), - ) - - print(f"Error Result: {error_result}") - print(f"Success: {error_result.success}") - print(f"Error: {error_result.error}") - print(f"Cost: ${error_result.cost:.4f}") - - # Check if operation succeeded - if not error_result.success: - print(f"\nOperation failed: {error_result.error}") - print("Timing information still available:") - print(error_result.get_timing_breakdown()) - - -def example_serialization(): - """Example of serialization methods.""" - print("\n\n=== Serialization Example ===\n") - - result = ScrapeResult( - success=True, - url="https://example.com", - cost=0.001, - data={"key": "value"}, - ) - - # Convert to dictionary - result_dict = result.to_dict() - print("Dictionary representation:") - print(result_dict) - - # Convert to JSON - json_str = result.to_json(indent=2) - print("\nJSON representation:") - print(json_str) - - # Save to different formats - result.save_to_file("result.json", format="json") - result.save_to_file("result.txt", format="txt") - print("\nSaved to result.json and result.txt") - - -if __name__ == "__main__": - example_scrape_result() - example_search_result() - example_crawl_result() - example_error_handling() - example_serialization() diff --git a/examples/09_result_models_demo.py b/examples/09_result_models_demo.py deleted file mode 100644 index 3c0719e..0000000 --- a/examples/09_result_models_demo.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Demo: Result models functionality demonstration.""" - -from datetime import datetime, timezone -from brightdata.models import BaseResult, ScrapeResult, SearchResult, CrawlResult - -print("=" * 60) -print("RESULT MODELS DEMONSTRATION") -print("=" * 60) - -# Test BaseResult 
-print("\n1. BaseResult:") -r = BaseResult(success=True, cost=0.001) -print(f" Created: {r}") -print(f" success: {r.success}") -print(f" cost: ${r.cost}") -print(f" error: {r.error}") -print(f" to_json(): {r.to_json()[:80]}...") - -# Test with timing -now = datetime.now(timezone.utc) -r2 = BaseResult( - success=True, - cost=0.002, - trigger_sent_at=now, - data_fetched_at=now, -) -print(f" elapsed_ms: {r2.elapsed_ms()}") -print(f" get_timing_breakdown: {list(r2.get_timing_breakdown().keys())}") - -# Test ScrapeResult -print("\n2. ScrapeResult:") -scrape = ScrapeResult( - success=True, - url="https://www.linkedin.com/in/test", - status="ready", - platform="linkedin", - cost=0.001, - trigger_sent_at=now, - data_fetched_at=now, -) -print(f" Created: {scrape}") -print(f" url: {scrape.url}") -print(f" platform: {scrape.platform}") -print(f" status: {scrape.status}") -print(f" get_timing_breakdown: {list(scrape.get_timing_breakdown().keys())}") - -# Test SearchResult -print("\n3. SearchResult:") -search = SearchResult( - success=True, - query={"q": "python async", "engine": "google"}, - total_found=1000, - search_engine="google", - cost=0.002, -) -print(f" Created: {search}") -print(f" query: {search.query}") -print(f" total_found: {search.total_found}") -print(f" search_engine: {search.search_engine}") - -# Test CrawlResult -print("\n4. CrawlResult:") -crawl = CrawlResult( - success=True, - domain="example.com", - pages=[{"url": "https://example.com/page1", "data": {}}], - total_pages=1, - cost=0.005, -) -print(f" Created: {crawl}") -print(f" domain: {crawl.domain}") -print(f" pages: {len(crawl.pages)}") -print(f" total_pages: {crawl.total_pages}") - -# Test utilities -print("\n5. 
Utilities:") -print(f" BaseResult.to_json(): {len(r.to_json())} chars") -print(f" ScrapeResult.to_json(): {len(scrape.to_json())} chars") -print(f" SearchResult.to_json(): {len(search.to_json())} chars") -print(f" CrawlResult.to_json(): {len(crawl.to_json())} chars") - -# Test interface requirements -print("\n6. Interface Requirements:") -print(" Common fields:") -print(f" result.success: {r.success} (bool)") -print(f" result.cost: ${r.cost} (float)") -print(f" result.error: {r.error} (str | None)") -print(f" result.trigger_sent_at: {r.trigger_sent_at} (datetime)") -print(f" result.data_fetched_at: {r.data_fetched_at} (datetime)") - -print("\n Service-specific fields:") -print(f" scrape_result.url: {scrape.url}") -print(f" scrape_result.platform: {scrape.platform}") -print(f" search_result.query: {search.query}") -print(f" search_result.total_found: {search.total_found}") -print(f" crawl_result.domain: {crawl.domain}") -print(f" crawl_result.pages: {len(crawl.pages)} items") - -print("\n Utilities:") -print(f" result.to_json(): {r.to_json()[:50]}...") -print(f" result.get_timing_breakdown(): {len(r2.get_timing_breakdown())} keys") - -print("\n" + "=" * 60) -print("ALL TESTS PASSED - FUNCTIONALITY VERIFIED!") -print("=" * 60) diff --git a/examples/10_pandas_integration.py b/examples/pandas_integration.py similarity index 100% rename from examples/10_pandas_integration.py rename to examples/pandas_integration.py diff --git a/examples/11_trigger_interface.py b/examples/trigger_interface.py similarity index 100% rename from examples/11_trigger_interface.py rename to examples/trigger_interface.py diff --git a/notebooks/test_v2.1.0_release.ipynb b/notebooks/test_v2.1.0_release.ipynb index 45242d5..76e8326 100644 --- a/notebooks/test_v2.1.0_release.ipynb +++ b/notebooks/test_v2.1.0_release.ipynb @@ -111,7 +111,7 @@ ], "source": [ "from brightdata import BrightDataClient, SyncBrightDataClient\n", - "from brightdata.api.async_unblocker import AsyncUnblockerClient\n", + 
"from brightdata.web_unlocker.async_client import AsyncUnblockerClient\n", "print(\"All imports successful!\")" ] }, @@ -170,7 +170,7 @@ ], "source": [ "import inspect\n", - "from brightdata.api.web_unlocker import WebUnlockerService\n", + "from brightdata.web_unlocker.service import WebUnlockerService\n", "\n", "# Check WebUnlocker has mode parameter\n", "sig = inspect.signature(WebUnlockerService.scrape)\n", diff --git a/pyproject.toml b/pyproject.toml index 1d0f90e..e2bb766 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ where = ["src"] [project] name = "brightdata-sdk" -version = "2.2.2" +version = "2.3.0" description = "Modern async-first Python SDK for Bright Data APIs" authors = [{name = "Bright Data", email = "support@brightdata.com"}] license = {text = "MIT"} @@ -51,7 +51,6 @@ target-version = ['py39'] line-length = 100 target-version = "py39" exclude = [ - "probe_tests", "notebooks", "examples", ".git", diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 431ef73..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,9 +0,0 @@ --r requirements.txt -pytest>=7.4.0 -pytest-asyncio>=0.21.0 -pytest-cov>=4.1.0 -pytest-mock>=3.11.0 -black>=23.0.0 -ruff>=0.1.0 -mypy>=1.5.0 -pre-commit>=3.4.0 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ba3dedb..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -aiohttp>=3.9.0 -requests>=2.31.0 -python-dotenv>=1.0.0 -tldextract>=5.0.0 -pydantic>=2.0.0 -pydantic-settings>=2.0.0 -click>=8.1.0 diff --git a/src/brightdata/__init__.py b/src/brightdata/__init__.py index 3408909..7772103 100644 --- a/src/brightdata/__init__.py +++ b/src/brightdata/__init__.py @@ -66,7 +66,6 @@ ValidationError, AuthenticationError, APIError, - TimeoutError, ZoneError, NetworkError, SSLError, @@ -76,9 +75,9 @@ from .scraper_studio.models import ScraperStudioJob, JobStatus # Export services for advanced usage -from .api.web_unlocker import WebUnlockerService 
-from .api.scraper_studio_service import ScraperStudioService -from .api.browser_service import BrowserService +from .web_unlocker.service import WebUnlockerService +from .scraper_studio.service import ScraperStudioService +from .browser.service import BrowserService from .core.zone_manager import ZoneManager __all__ = [ @@ -125,7 +124,6 @@ "ValidationError", "AuthenticationError", "APIError", - "TimeoutError", "ZoneError", "NetworkError", "SSLError", diff --git a/src/brightdata/_internal/__init__.py b/src/brightdata/_internal/__init__.py deleted file mode 100644 index 678630a..0000000 --- a/src/brightdata/_internal/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Private implementation details.""" diff --git a/src/brightdata/_internal/compat.py b/src/brightdata/_internal/compat.py deleted file mode 100644 index a3db63e..0000000 --- a/src/brightdata/_internal/compat.py +++ /dev/null @@ -1 +0,0 @@ -"""Python version compatibility (if needed).""" diff --git a/src/brightdata/api/__init__.py b/src/brightdata/api/__init__.py deleted file mode 100644 index ef85d83..0000000 --- a/src/brightdata/api/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""API implementations.""" diff --git a/src/brightdata/api/crawl.py b/src/brightdata/api/crawl.py deleted file mode 100644 index 4bf927e..0000000 --- a/src/brightdata/api/crawl.py +++ /dev/null @@ -1 +0,0 @@ -"""Web Crawl API.""" diff --git a/src/brightdata/api/datasets.py b/src/brightdata/api/datasets.py deleted file mode 100644 index 9efcb84..0000000 --- a/src/brightdata/api/datasets.py +++ /dev/null @@ -1 +0,0 @@ -"""Datasets API.""" diff --git a/src/brightdata/api/download.py b/src/brightdata/api/download.py deleted file mode 100644 index b4a2786..0000000 --- a/src/brightdata/api/download.py +++ /dev/null @@ -1 +0,0 @@ -"""Download/snapshot operations.""" diff --git a/src/brightdata/auto.py b/src/brightdata/auto.py deleted file mode 100644 index 38833c6..0000000 --- a/src/brightdata/auto.py +++ /dev/null @@ -1 +0,0 @@ -"""Simplified 
one-liner API for common use cases.""" diff --git a/src/brightdata/browser/__init__.py b/src/brightdata/browser/__init__.py new file mode 100644 index 0000000..77d190f --- /dev/null +++ b/src/brightdata/browser/__init__.py @@ -0,0 +1,5 @@ +"""Browser API service.""" + +from .service import BrowserService + +__all__ = ["BrowserService"] diff --git a/src/brightdata/api/browser_service.py b/src/brightdata/browser/service.py similarity index 100% rename from src/brightdata/api/browser_service.py rename to src/brightdata/browser/service.py diff --git a/src/brightdata/client.py b/src/brightdata/client.py index 02bdc98..5cf6841 100644 --- a/src/brightdata/client.py +++ b/src/brightdata/client.py @@ -23,20 +23,16 @@ from .core.engine import AsyncEngine from .core.zone_manager import ZoneManager -from .api.web_unlocker import WebUnlockerService -from .api.scrape_service import ScrapeService -from .api.search_service import SearchService -from .api.crawler_service import CrawlerService -from .api.scraper_studio_service import ScraperStudioService -from .api.browser_service import BrowserService +from .web_unlocker.service import WebUnlockerService +from .scrapers.service import ScrapeService +from .serp.service import SearchService +from .crawler.service import CrawlerService +from .scraper_studio.service import ScraperStudioService +from .browser.service import BrowserService from .datasets import DatasetsClient from .models import ScrapeResult from .types import AccountInfo -from .constants import ( - HTTP_OK, - HTTP_UNAUTHORIZED, - HTTP_FORBIDDEN, -) +from http import HTTPStatus from .exceptions import ValidationError, AuthenticationError, APIError @@ -420,7 +416,7 @@ async def test_connection(self) -> bool: async with self.engine.get_from_url( f"{self.engine.BASE_URL}/zone/get_active_zones" ) as response: - if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: self._is_connected = True return True else: @@ -475,7 +471,7 @@ async def get_account_info(self, 
refresh: bool = False) -> AccountInfo: async with self.engine.get_from_url( f"{self.engine.BASE_URL}/zone/get_active_zones" ) as zones_response: - if zones_response.status == HTTP_OK: + if zones_response.status == HTTPStatus.OK: zones = await zones_response.json() zones = zones or [] @@ -501,7 +497,7 @@ async def get_account_info(self, refresh: bool = False) -> AccountInfo: self._account_info = account_info return account_info - elif zones_response.status in (HTTP_UNAUTHORIZED, HTTP_FORBIDDEN): + elif zones_response.status in (HTTPStatus.UNAUTHORIZED, HTTPStatus.FORBIDDEN): error_text = await zones_response.text() raise AuthenticationError( f"Invalid token (HTTP {zones_response.status}): {error_text}" diff --git a/src/brightdata/config.py b/src/brightdata/config.py deleted file mode 100644 index c6b6bf9..0000000 --- a/src/brightdata/config.py +++ /dev/null @@ -1 +0,0 @@ -"""Configuration (Pydantic Settings).""" diff --git a/src/brightdata/constants.py b/src/brightdata/constants.py index 369f2d9..bcb845d 100644 --- a/src/brightdata/constants.py +++ b/src/brightdata/constants.py @@ -52,28 +52,3 @@ SCRAPER_STUDIO_POLL_INTERVAL: int = 10 """Default interval in seconds between poll attempts for Scraper Studio.""" - -# HTTP Status Codes -HTTP_OK: int = 200 -"""HTTP 200 OK - Request succeeded.""" - -HTTP_CREATED: int = 201 -"""HTTP 201 Created - Resource created successfully.""" - -HTTP_ACCEPTED: int = 202 -"""HTTP 202 Accepted - Request accepted for async processing.""" - -HTTP_BAD_REQUEST: int = 400 -"""HTTP 400 Bad Request - Invalid request parameters.""" - -HTTP_UNAUTHORIZED: int = 401 -"""HTTP 401 Unauthorized - Authentication required or failed.""" - -HTTP_FORBIDDEN: int = 403 -"""HTTP 403 Forbidden - Access denied.""" - -HTTP_CONFLICT: int = 409 -"""HTTP 409 Conflict - Resource conflict (e.g., duplicate).""" - -HTTP_INTERNAL_SERVER_ERROR: int = 500 -"""HTTP 500 Internal Server Error - Server error.""" diff --git a/src/brightdata/core/auth.py 
b/src/brightdata/core/auth.py deleted file mode 100644 index 814baa4..0000000 --- a/src/brightdata/core/auth.py +++ /dev/null @@ -1 +0,0 @@ -"""Authentication handling.""" diff --git a/src/brightdata/core/engine.py b/src/brightdata/core/engine.py index e9b0858..9fa058d 100644 --- a/src/brightdata/core/engine.py +++ b/src/brightdata/core/engine.py @@ -6,8 +6,8 @@ import warnings from typing import Optional, Dict, Any from .. import __version__ -from ..exceptions import AuthenticationError, NetworkError, TimeoutError, SSLError -from ..constants import HTTP_UNAUTHORIZED, HTTP_FORBIDDEN +from ..exceptions import AuthenticationError, NetworkError, SSLError +from http import HTTPStatus from ..utils.ssl_helpers import is_ssl_certificate_error, get_ssl_error_message # Rate limiting support @@ -383,16 +383,24 @@ async def __aenter__(self): timeout=self._timeout, ) # Check status codes that should raise exceptions - if self._response.status == HTTP_UNAUTHORIZED: + if self._response.status == HTTPStatus.UNAUTHORIZED: text = await self._response.text() await self._response.release() - raise AuthenticationError(f"Unauthorized ({HTTP_UNAUTHORIZED}): {text}") - elif self._response.status == HTTP_FORBIDDEN: + raise AuthenticationError( + f"Unauthorized ({HTTPStatus.UNAUTHORIZED}): {text}" + ) + elif self._response.status == HTTPStatus.FORBIDDEN: text = await self._response.text() await self._response.release() - raise AuthenticationError(f"Forbidden ({HTTP_FORBIDDEN}): {text}") + raise AuthenticationError(f"Forbidden ({HTTPStatus.FORBIDDEN}): {text}") return self._response + except asyncio.TimeoutError as e: + # Must be caught before OSError — on Python 3.11+, + # TimeoutError is a subclass of OSError + raise TimeoutError( + f"Request timeout after {self._timeout.total} seconds" + ) from e except (aiohttp.ClientError, ssl.SSLError, OSError) as e: # Check for SSL certificate errors first # aiohttp wraps SSL errors in ClientConnectorError or ClientSSLError @@ -402,10 +410,6 @@ async 
def __aenter__(self): raise SSLError(error_message) from e # Other network errors raise NetworkError(f"Network error: {str(e)}") from e - except asyncio.TimeoutError as e: - raise TimeoutError( - f"Request timeout after {self._timeout.total} seconds" - ) from e async def __aexit__(self, exc_type, exc_val, exc_tb): if self._response: diff --git a/src/brightdata/core/hooks.py b/src/brightdata/core/hooks.py deleted file mode 100644 index 24564ad..0000000 --- a/src/brightdata/core/hooks.py +++ /dev/null @@ -1 +0,0 @@ -"""Event hooks system.""" diff --git a/src/brightdata/core/logging.py b/src/brightdata/core/logging.py deleted file mode 100644 index 139de09..0000000 --- a/src/brightdata/core/logging.py +++ /dev/null @@ -1 +0,0 @@ -"""Structured logging.""" diff --git a/src/brightdata/core/zone_manager.py b/src/brightdata/core/zone_manager.py index 087053d..b0a7b6f 100644 --- a/src/brightdata/core/zone_manager.py +++ b/src/brightdata/core/zone_manager.py @@ -7,16 +7,8 @@ import logging import aiohttp from typing import List, Dict, Any, Optional, Tuple +from http import HTTPStatus from ..exceptions.errors import ZoneError, APIError, AuthenticationError -from ..constants import ( - HTTP_OK, - HTTP_CREATED, - HTTP_BAD_REQUEST, - HTTP_UNAUTHORIZED, - HTTP_FORBIDDEN, - HTTP_CONFLICT, - HTTP_INTERNAL_SERVER_ERROR, -) logger = logging.getLogger(__name__) @@ -141,10 +133,10 @@ async def _get_zones(self) -> List[Dict[str, Any]]: for attempt in range(max_retries): try: async with self.engine.get("/zone/get_active_zones") as response: - if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: zones = await response.json() return zones or [] - elif response.status in (HTTP_UNAUTHORIZED, HTTP_FORBIDDEN): + elif response.status in (HTTPStatus.UNAUTHORIZED, HTTPStatus.FORBIDDEN): error_text = await response.text() raise AuthenticationError( f"Authentication failed ({response.status}): {error_text}" @@ -153,7 +145,7 @@ async def _get_zones(self) -> List[Dict[str, Any]]: 
error_text = await response.text() if ( attempt < max_retries - 1 - and response.status >= HTTP_INTERNAL_SERVER_ERROR + and response.status >= HTTPStatus.INTERNAL_SERVER_ERROR ): logger.warning( f"Zone list request failed (attempt {attempt + 1}/{max_retries}): " @@ -201,10 +193,10 @@ async def _create_zone(self, zone_name: str, zone_type: str) -> None: for attempt in range(max_retries): try: async with self.engine.post("/zone", json_data=payload) as response: - if response.status in (HTTP_OK, HTTP_CREATED): + if response.status in (HTTPStatus.OK, HTTPStatus.CREATED): logger.info(f"Zone creation successful: {zone_name}") return - elif response.status == HTTP_CONFLICT: + elif response.status == HTTPStatus.CONFLICT: # Zone already exists - this is fine logger.info(f"Zone {zone_name} already exists - this is expected") return @@ -220,7 +212,7 @@ async def _create_zone(self, zone_name: str, zone_type: str) -> None: return # Handle authentication/permission errors - if response.status in (HTTP_UNAUTHORIZED, HTTP_FORBIDDEN): + if response.status in (HTTPStatus.UNAUTHORIZED, HTTPStatus.FORBIDDEN): # Check for specific permission error if ( "permission" in error_text.lower() @@ -254,15 +246,15 @@ async def _create_zone(self, zone_name: str, zone_type: str) -> None: ) # Handle bad request - if response.status == HTTP_BAD_REQUEST: + if response.status == HTTPStatus.BAD_REQUEST: raise ZoneError( - f"Bad request ({HTTP_BAD_REQUEST}) creating zone '{zone_name}': {error_text}" + f"Bad request ({HTTPStatus.BAD_REQUEST}) creating zone '{zone_name}': {error_text}" ) # Retry on server errors if ( attempt < max_retries - 1 - and response.status >= HTTP_INTERNAL_SERVER_ERROR + and response.status >= HTTPStatus.INTERNAL_SERVER_ERROR ): logger.warning( f"Zone creation failed (attempt {attempt + 1}/{max_retries}): " @@ -398,15 +390,15 @@ async def delete_zone(self, zone_name: str) -> None: payload = {"zone": zone_name} async with self.engine.delete("/zone", json_data=payload) as response: 
- if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: logger.info(f"Zone '{zone_name}' successfully deleted") return - elif response.status in (HTTP_UNAUTHORIZED, HTTP_FORBIDDEN): + elif response.status in (HTTPStatus.UNAUTHORIZED, HTTPStatus.FORBIDDEN): error_text = await response.text() raise AuthenticationError( f"Authentication failed ({response.status}) deleting zone '{zone_name}': {error_text}" ) - elif response.status == HTTP_BAD_REQUEST: + elif response.status == HTTPStatus.BAD_REQUEST: error_text = await response.text() # Check if zone doesn't exist if ( @@ -417,7 +409,7 @@ async def delete_zone(self, zone_name: str) -> None: f"Zone '{zone_name}' does not exist or has already been deleted" ) raise ZoneError( - f"Bad request ({HTTP_BAD_REQUEST}) deleting zone '{zone_name}': {error_text}" + f"Bad request ({HTTPStatus.BAD_REQUEST}) deleting zone '{zone_name}': {error_text}" ) else: error_text = await response.text() @@ -425,7 +417,7 @@ async def delete_zone(self, zone_name: str) -> None: # Retry on server errors if ( attempt < max_retries - 1 - and response.status >= HTTP_INTERNAL_SERVER_ERROR + and response.status >= HTTPStatus.INTERNAL_SERVER_ERROR ): logger.warning( f"Zone deletion failed (attempt {attempt + 1}/{max_retries}): " diff --git a/src/brightdata/crawler/__init__.py b/src/brightdata/crawler/__init__.py new file mode 100644 index 0000000..1f37b7a --- /dev/null +++ b/src/brightdata/crawler/__init__.py @@ -0,0 +1,5 @@ +"""Crawler service.""" + +from .service import CrawlerService + +__all__ = ["CrawlerService"] diff --git a/src/brightdata/api/crawler_service.py b/src/brightdata/crawler/service.py similarity index 100% rename from src/brightdata/api/crawler_service.py rename to src/brightdata/crawler/service.py diff --git a/src/brightdata/datasets/agoda/properties.py b/src/brightdata/datasets/agoda/properties.py index aa43f2d..5d66e37 100644 --- a/src/brightdata/datasets/agoda/properties.py +++ 
b/src/brightdata/datasets/agoda/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AgodaProperties(BaseDataset): diff --git a/src/brightdata/datasets/airbnb/properties.py b/src/brightdata/datasets/airbnb/properties.py index 9a35869..a03a561 100644 --- a/src/brightdata/datasets/airbnb/properties.py +++ b/src/brightdata/datasets/airbnb/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AirbnbProperties(BaseDataset): diff --git a/src/brightdata/datasets/amazon/best_sellers.py b/src/brightdata/datasets/amazon/best_sellers.py index c6f1c55..7840898 100644 --- a/src/brightdata/datasets/amazon/best_sellers.py +++ b/src/brightdata/datasets/amazon/best_sellers.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AmazonBestSellers(BaseDataset): diff --git a/src/brightdata/datasets/amazon/products.py b/src/brightdata/datasets/amazon/products.py index b2680d7..058bddc 100644 --- a/src/brightdata/datasets/amazon/products.py +++ b/src/brightdata/datasets/amazon/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AmazonProducts(BaseDataset): diff --git a/src/brightdata/datasets/amazon/products_global.py b/src/brightdata/datasets/amazon/products_global.py index 5e3e8b1..ad889d6 100644 --- a/src/brightdata/datasets/amazon/products_global.py +++ b/src/brightdata/datasets/amazon/products_global.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AmazonProductsGlobal(BaseDataset): diff --git 
a/src/brightdata/datasets/amazon/products_search.py b/src/brightdata/datasets/amazon/products_search.py index cfa1832..fa3be9f 100644 --- a/src/brightdata/datasets/amazon/products_search.py +++ b/src/brightdata/datasets/amazon/products_search.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AmazonProductsSearch(BaseDataset): diff --git a/src/brightdata/datasets/amazon/reviews.py b/src/brightdata/datasets/amazon/reviews.py index 256cd3b..5d7ea24 100644 --- a/src/brightdata/datasets/amazon/reviews.py +++ b/src/brightdata/datasets/amazon/reviews.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AmazonReviews(BaseDataset): diff --git a/src/brightdata/datasets/amazon/sellers.py b/src/brightdata/datasets/amazon/sellers.py index 417aa11..7e44c68 100644 --- a/src/brightdata/datasets/amazon/sellers.py +++ b/src/brightdata/datasets/amazon/sellers.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AmazonSellersInfo(BaseDataset): diff --git a/src/brightdata/datasets/amazon/walmart.py b/src/brightdata/datasets/amazon/walmart.py index a58b395..ddfee5b 100644 --- a/src/brightdata/datasets/amazon/walmart.py +++ b/src/brightdata/datasets/amazon/walmart.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AmazonWalmart(BaseDataset): diff --git a/src/brightdata/datasets/american_eagle/products.py b/src/brightdata/datasets/american_eagle/products.py index 5469158..15808ff 100644 --- a/src/brightdata/datasets/american_eagle/products.py +++ b/src/brightdata/datasets/american_eagle/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset 
if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AmericanEagleProducts(BaseDataset): diff --git a/src/brightdata/datasets/apple_appstore/reviews.py b/src/brightdata/datasets/apple_appstore/reviews.py index 5e9fe9e..0373901 100644 --- a/src/brightdata/datasets/apple_appstore/reviews.py +++ b/src/brightdata/datasets/apple_appstore/reviews.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AppleAppStoreReviews(BaseDataset): diff --git a/src/brightdata/datasets/apple_appstore/store.py b/src/brightdata/datasets/apple_appstore/store.py index 2bf9b47..653a961 100644 --- a/src/brightdata/datasets/apple_appstore/store.py +++ b/src/brightdata/datasets/apple_appstore/store.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AppleAppStore(BaseDataset): diff --git a/src/brightdata/datasets/ashley_furniture/products.py b/src/brightdata/datasets/ashley_furniture/products.py index df41b74..b944567 100644 --- a/src/brightdata/datasets/ashley_furniture/products.py +++ b/src/brightdata/datasets/ashley_furniture/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AshleyFurnitureProducts(BaseDataset): diff --git a/src/brightdata/datasets/asos/products.py b/src/brightdata/datasets/asos/products.py index a8e2959..05c6ae0 100644 --- a/src/brightdata/datasets/asos/products.py +++ b/src/brightdata/datasets/asos/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AsosProducts(BaseDataset): diff --git a/src/brightdata/datasets/autozone/products.py 
b/src/brightdata/datasets/autozone/products.py index bd46151..87ed189 100644 --- a/src/brightdata/datasets/autozone/products.py +++ b/src/brightdata/datasets/autozone/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AutozoneProducts(BaseDataset): diff --git a/src/brightdata/datasets/balenciaga/products.py b/src/brightdata/datasets/balenciaga/products.py index 2c0a6c1..4224d5d 100644 --- a/src/brightdata/datasets/balenciaga/products.py +++ b/src/brightdata/datasets/balenciaga/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class BalenciagaProducts(BaseDataset): diff --git a/src/brightdata/datasets/base.py b/src/brightdata/datasets/base.py index 7beb7fb..3c2bebc 100644 --- a/src/brightdata/datasets/base.py +++ b/src/brightdata/datasets/base.py @@ -9,7 +9,7 @@ from .models import DatasetMetadata, SnapshotStatus if TYPE_CHECKING: - from ..core.async_engine import AsyncEngine + from ..core.engine import AsyncEngine class DatasetError(Exception): diff --git a/src/brightdata/datasets/bbc/news.py b/src/brightdata/datasets/bbc/news.py index e53fa35..0ea7527 100644 --- a/src/brightdata/datasets/bbc/news.py +++ b/src/brightdata/datasets/bbc/news.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class BBCNews(BaseDataset): diff --git a/src/brightdata/datasets/berluti/products.py b/src/brightdata/datasets/berluti/products.py index 4eb746e..daee0f7 100644 --- a/src/brightdata/datasets/berluti/products.py +++ b/src/brightdata/datasets/berluti/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class 
BerlutiProducts(BaseDataset): diff --git a/src/brightdata/datasets/bestbuy/products.py b/src/brightdata/datasets/bestbuy/products.py index 3deb6f1..ffe30bc 100644 --- a/src/brightdata/datasets/bestbuy/products.py +++ b/src/brightdata/datasets/bestbuy/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class BestBuyProducts(BaseDataset): diff --git a/src/brightdata/datasets/bh/products.py b/src/brightdata/datasets/bh/products.py index 6f12ff8..7effc5b 100644 --- a/src/brightdata/datasets/bh/products.py +++ b/src/brightdata/datasets/bh/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class BHProducts(BaseDataset): diff --git a/src/brightdata/datasets/bluesky/posts.py b/src/brightdata/datasets/bluesky/posts.py index a7bbf49..a4454b9 100644 --- a/src/brightdata/datasets/bluesky/posts.py +++ b/src/brightdata/datasets/bluesky/posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class BlueskyPosts(BaseDataset): diff --git a/src/brightdata/datasets/bluesky/top_profiles.py b/src/brightdata/datasets/bluesky/top_profiles.py index 299f101..3b8dbe9 100644 --- a/src/brightdata/datasets/bluesky/top_profiles.py +++ b/src/brightdata/datasets/bluesky/top_profiles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class BlueskyTopProfiles(BaseDataset): diff --git a/src/brightdata/datasets/booking/hotel_listings.py b/src/brightdata/datasets/booking/hotel_listings.py index 43afa1c..b00260c 100644 --- a/src/brightdata/datasets/booking/hotel_listings.py +++ b/src/brightdata/datasets/booking/hotel_listings.py @@ -11,7 +11,7 @@ from ..base import 
BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class BookingHotelListings(BaseDataset): diff --git a/src/brightdata/datasets/booking/listings_search.py b/src/brightdata/datasets/booking/listings_search.py index edf9abe..cbc4125 100644 --- a/src/brightdata/datasets/booking/listings_search.py +++ b/src/brightdata/datasets/booking/listings_search.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class BookingListingsSearch(BaseDataset): diff --git a/src/brightdata/datasets/bottegaveneta/products.py b/src/brightdata/datasets/bottegaveneta/products.py index d234c92..66b1282 100644 --- a/src/brightdata/datasets/bottegaveneta/products.py +++ b/src/brightdata/datasets/bottegaveneta/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class BottegaVenetaProducts(BaseDataset): diff --git a/src/brightdata/datasets/carsales/listings.py b/src/brightdata/datasets/carsales/listings.py index 0933b81..95fbba9 100644 --- a/src/brightdata/datasets/carsales/listings.py +++ b/src/brightdata/datasets/carsales/listings.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class CarsalesListings(BaseDataset): diff --git a/src/brightdata/datasets/carters/products.py b/src/brightdata/datasets/carters/products.py index 5a38bde..4741307 100644 --- a/src/brightdata/datasets/carters/products.py +++ b/src/brightdata/datasets/carters/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class CartersProducts(BaseDataset): diff --git a/src/brightdata/datasets/celine/products.py 
b/src/brightdata/datasets/celine/products.py index aef3ca6..a65109c 100644 --- a/src/brightdata/datasets/celine/products.py +++ b/src/brightdata/datasets/celine/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class CelineProducts(BaseDataset): diff --git a/src/brightdata/datasets/chanel/products.py b/src/brightdata/datasets/chanel/products.py index 83c2a09..5684ff2 100644 --- a/src/brightdata/datasets/chanel/products.py +++ b/src/brightdata/datasets/chanel/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ChanelProducts(BaseDataset): diff --git a/src/brightdata/datasets/chileautos/cars.py b/src/brightdata/datasets/chileautos/cars.py index c8c6d36..1cc3749 100644 --- a/src/brightdata/datasets/chileautos/cars.py +++ b/src/brightdata/datasets/chileautos/cars.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ChileautosChile(BaseDataset): diff --git a/src/brightdata/datasets/client.py b/src/brightdata/datasets/client.py index 8759762..a332da7 100644 --- a/src/brightdata/datasets/client.py +++ b/src/brightdata/datasets/client.py @@ -1,165 +1,223 @@ """ Datasets client - main entry point for datasets API. + +Uses a registry + __getattr__ for lazy-loading dataset instances. +IDE autocomplete is provided via the companion client.pyi stub file. 
""" -from typing import List, Optional, TYPE_CHECKING +import importlib +from typing import List, TYPE_CHECKING from .models import DatasetInfo -from .linkedin import ( - LinkedInPeopleProfiles, - LinkedInCompanyProfiles, - LinkedInJobListings, - LinkedInPosts, - LinkedInProfilesJobListings, -) -from .amazon import ( - AmazonProducts, - AmazonReviews, - AmazonSellersInfo, - AmazonBestSellers, - AmazonProductsSearch, - AmazonProductsGlobal, - AmazonWalmart, -) -from .crunchbase import CrunchbaseCompanies -from .imdb import IMDBMovies -from .nba import NBAPlayersStats -from .goodreads import GoodreadsBooks -from .world_population import WorldPopulation -from .companies_enriched import CompaniesEnriched -from .employees_enriched import EmployeesEnriched -from .glassdoor import GlassdoorCompanies, GlassdoorReviews, GlassdoorJobs -from .google_maps import GoogleMapsReviews, GoogleMapsFullInfo -from .yelp import YelpBusinesses, YelpReviews -from .zoominfo import ZoomInfoCompanies -from .pitchbook import PitchBookCompanies -from .g2 import G2Products, G2Reviews -from .trustpilot import TrustpilotReviews -from .indeed import IndeedCompanies, IndeedJobs -from .xing import XingProfiles -from .slintel import SlintelCompanies -from .owler import OwlerCompanies -from .lawyers import USLawyers -from .manta import MantaBusinesses -from .ventureradar import VentureRadarCompanies -from .trustradius import TrustRadiusReviews -from .instagram import InstagramProfiles, InstagramPosts, InstagramComments, InstagramReels -from .tiktok import TikTokProfiles, TikTokComments, TikTokPosts, TikTokShop -from .real_estate import AustraliaRealEstate -from .walmart import WalmartProducts, WalmartSellersInfo -from .mediamarkt import MediamarktProducts -from .fendi import FendiProducts -from .zalando import ZalandoProducts -from .sephora import SephoraProducts -from .zara import ZaraProducts, ZaraHomeProducts -from .mango import MangoProducts -from .massimo_dutti import MassimoDuttiProducts -from 
.otodom import OtodomPoland -from .webmotors import WebmotorsBrasil -from .airbnb import AirbnbProperties -from .asos import AsosProducts -from .chanel import ChanelProducts -from .ashley_furniture import AshleyFurnitureProducts -from .fanatics import FanaticsProducts -from .carters import CartersProducts -from .american_eagle import AmericanEagleProducts -from .ikea import IkeaProducts -from .hm import HMProducts -from .lego import LegoProducts -from .mattressfirm import MattressfirmProducts -from .crateandbarrel import CrateAndBarrelProducts -from .llbean import LLBeanProducts -from .shein import SheinProducts -from .toysrus import ToysRUsProducts -from .mybobs import MybobsProducts -from .sleepnumber import SleepNumberProducts -from .raymourflanigan import RaymourFlaniganProducts -from .inmuebles24 import Inmuebles24Mexico -from .mouser import MouserProducts -from .zillow import ZillowProperties, ZillowPriceHistory -from .zonaprop import ZonapropArgentina -from .metrocuadrado import MetrocuadradoProperties -from .chileautos import ChileautosChile -from .infocasas import InfocasasUruguay -from .lazboy import LaZBoyProducts -from .properati import ProperatiProperties -from .yapo import YapoChile -from .toctoc import ToctocProperties -from .dior import DiorProducts -from .balenciaga import BalenciagaProducts -from .bottegaveneta import BottegaVenetaProducts -from .olx import OLXBrazil -from .celine import CelineProducts -from .loewe import LoeweProducts -from .berluti import BerlutiProducts -from .moynat import MoynatProducts -from .hermes import HermesProducts -from .delvaux import DelvauxProducts -from .prada import PradaProducts -from .montblanc import MontblancProducts -from .ysl import YSLProducts -from .world_zipcodes import WorldZipcodes -from .pinterest import PinterestPosts, PinterestProfiles -from .shopee import ShopeeProducts -from .lazada import LazadaProducts, LazadaReviews, LazadaProductsSearch -from .youtube import YouTubeProfiles, YouTubeVideos, 
YouTubeComments -from .digikey import DigikeyProducts -from .facebook import ( - FacebookPagesPosts, - FacebookComments, - FacebookPostsByUrl, - FacebookReels, - FacebookMarketplace, - FacebookCompanyReviews, - FacebookEvents, - FacebookProfiles, - FacebookPagesProfiles, - FacebookGroupPosts, -) -from .x_twitter import XTwitterPosts, XTwitterProfiles -from .reddit import RedditPosts, RedditComments -from .bluesky import BlueskyPosts, BlueskyTopProfiles -from .snapchat import SnapchatPosts -from .quora import QuoraPosts -from .vimeo import VimeoVideos -from .google_news import GoogleNews -from .wikipedia import WikipediaArticles -from .bbc import BBCNews -from .cnn import CNNNews -from .github import GithubRepositories -from .creative_commons import CreativeCommonsImages, CreativeCommons3DModels -from .google_play import GooglePlayStore, GooglePlayReviews -from .apple_appstore import AppleAppStore, AppleAppStoreReviews -from .ebay import EbayProducts -from .etsy import EtsyProducts -from .target import TargetProducts -from .wayfair import WayfairProducts -from .bestbuy import BestBuyProducts -from .myntra import MyntraProducts -from .ozon import OzonProducts -from .wildberries import WildberriesProducts -from .tokopedia import TokopediaProducts -from .google_shopping import GoogleShoppingProducts, GoogleShoppingSearchUS -from .mercadolivre import MercadolivreProducts -from .naver import NaverProducts -from .homedepot import HomeDepotUSProducts, HomeDepotCAProducts -from .lowes import LowesProducts -from .rona import RonaProducts -from .kroger import KrogerProducts -from .macys import MacysProducts -from .costco import CostcoProducts -from .bh import BHProducts -from .microcenter import MicroCenterProducts -from .autozone import AutozoneProducts -from .zoopla import ZooplaProperties -from .booking import BookingListingsSearch, BookingHotelListings -from .realtor import RealtorInternationalProperties -from .agoda import AgodaProperties -from .carsales import 
CarsalesListings -from .yahoo_finance import YahooFinanceBusinesses if TYPE_CHECKING: - from ..core.async_engine import AsyncEngine + from ..core.engine import AsyncEngine + +# Registry: property_name -> (relative_module, class_name) +# To add a new dataset, just add one line here. +_DATASET_REGISTRY = { + # LinkedIn + "linkedin_profiles": (".linkedin", "LinkedInPeopleProfiles"), + "linkedin_companies": (".linkedin", "LinkedInCompanyProfiles"), + "linkedin_job_listings": (".linkedin", "LinkedInJobListings"), + "linkedin_posts": (".linkedin", "LinkedInPosts"), + "linkedin_profiles_job_listings": (".linkedin", "LinkedInProfilesJobListings"), + # Amazon + "amazon_products": (".amazon", "AmazonProducts"), + "amazon_reviews": (".amazon", "AmazonReviews"), + "amazon_sellers_info": (".amazon", "AmazonSellersInfo"), + "amazon_best_sellers": (".amazon", "AmazonBestSellers"), + "amazon_products_search": (".amazon", "AmazonProductsSearch"), + "amazon_products_global": (".amazon", "AmazonProductsGlobal"), + "amazon_walmart": (".amazon", "AmazonWalmart"), + # Business Data + "crunchbase_companies": (".crunchbase", "CrunchbaseCompanies"), + "zoominfo_companies": (".zoominfo", "ZoomInfoCompanies"), + "pitchbook_companies": (".pitchbook", "PitchBookCompanies"), + "slintel_companies": (".slintel", "SlintelCompanies"), + "owler_companies": (".owler", "OwlerCompanies"), + "ventureradar_companies": (".ventureradar", "VentureRadarCompanies"), + "companies_enriched": (".companies_enriched", "CompaniesEnriched"), + "employees_enriched": (".employees_enriched", "EmployeesEnriched"), + "manta_businesses": (".manta", "MantaBusinesses"), + # Reviews & Ratings + "glassdoor_companies": (".glassdoor", "GlassdoorCompanies"), + "glassdoor_reviews": (".glassdoor", "GlassdoorReviews"), + "glassdoor_jobs": (".glassdoor", "GlassdoorJobs"), + "google_maps_reviews": (".google_maps", "GoogleMapsReviews"), + "google_maps_full_info": (".google_maps", "GoogleMapsFullInfo"), + "yelp_businesses": (".yelp", 
"YelpBusinesses"), + "yelp_reviews": (".yelp", "YelpReviews"), + "g2_products": (".g2", "G2Products"), + "g2_reviews": (".g2", "G2Reviews"), + "trustpilot_reviews": (".trustpilot", "TrustpilotReviews"), + "trustradius_reviews": (".trustradius", "TrustRadiusReviews"), + # Jobs + "indeed_companies": (".indeed", "IndeedCompanies"), + "indeed_jobs": (".indeed", "IndeedJobs"), + "xing_profiles": (".xing", "XingProfiles"), + "us_lawyers": (".lawyers", "USLawyers"), + # Social Media - Instagram + "instagram_profiles": (".instagram", "InstagramProfiles"), + "instagram_posts": (".instagram", "InstagramPosts"), + "instagram_comments": (".instagram", "InstagramComments"), + "instagram_reels": (".instagram", "InstagramReels"), + # Social Media - TikTok + "tiktok_profiles": (".tiktok", "TikTokProfiles"), + "tiktok_posts": (".tiktok", "TikTokPosts"), + "tiktok_comments": (".tiktok", "TikTokComments"), + "tiktok_shop": (".tiktok", "TikTokShop"), + # Social Media - Facebook + "facebook_pages_posts": (".facebook", "FacebookPagesPosts"), + "facebook_comments": (".facebook", "FacebookComments"), + "facebook_posts_by_url": (".facebook", "FacebookPostsByUrl"), + "facebook_reels": (".facebook", "FacebookReels"), + "facebook_marketplace": (".facebook", "FacebookMarketplace"), + "facebook_company_reviews": (".facebook", "FacebookCompanyReviews"), + "facebook_events": (".facebook", "FacebookEvents"), + "facebook_profiles": (".facebook", "FacebookProfiles"), + "facebook_pages_profiles": (".facebook", "FacebookPagesProfiles"), + "facebook_group_posts": (".facebook", "FacebookGroupPosts"), + # Social Media - X/Twitter + "x_twitter_posts": (".x_twitter", "XTwitterPosts"), + "x_twitter_profiles": (".x_twitter", "XTwitterProfiles"), + # Social Media - Other + "reddit_posts": (".reddit", "RedditPosts"), + "reddit_comments": (".reddit", "RedditComments"), + "bluesky_posts": (".bluesky", "BlueskyPosts"), + "bluesky_top_profiles": (".bluesky", "BlueskyTopProfiles"), + "snapchat_posts": (".snapchat", 
"SnapchatPosts"), + "quora_posts": (".quora", "QuoraPosts"), + "pinterest_posts": (".pinterest", "PinterestPosts"), + "pinterest_profiles": (".pinterest", "PinterestProfiles"), + # Video + "youtube_profiles": (".youtube", "YouTubeProfiles"), + "youtube_videos": (".youtube", "YouTubeVideos"), + "youtube_comments": (".youtube", "YouTubeComments"), + "vimeo_videos": (".vimeo", "VimeoVideos"), + # News & Content + "google_news": (".google_news", "GoogleNews"), + "wikipedia_articles": (".wikipedia", "WikipediaArticles"), + "bbc_news": (".bbc", "BBCNews"), + "cnn_news": (".cnn", "CNNNews"), + "github_repositories": (".github", "GithubRepositories"), + "creative_commons_images": (".creative_commons", "CreativeCommonsImages"), + "creative_commons_3d_models": (".creative_commons", "CreativeCommons3DModels"), + # App Stores + "google_play_store": (".google_play", "GooglePlayStore"), + "google_play_reviews": (".google_play", "GooglePlayReviews"), + "apple_app_store": (".apple_appstore", "AppleAppStore"), + "apple_app_store_reviews": (".apple_appstore", "AppleAppStoreReviews"), + # E-commerce - General + "walmart_products": (".walmart", "WalmartProducts"), + "walmart_sellers_info": (".walmart", "WalmartSellersInfo"), + "ebay_products": (".ebay", "EbayProducts"), + "etsy_products": (".etsy", "EtsyProducts"), + "target_products": (".target", "TargetProducts"), + "bestbuy_products": (".bestbuy", "BestBuyProducts"), + "costco_products": (".costco", "CostcoProducts"), + "macys_products": (".macys", "MacysProducts"), + "kroger_products": (".kroger", "KrogerProducts"), + "wayfair_products": (".wayfair", "WayfairProducts"), + "shein_products": (".shein", "SheinProducts"), + # E-commerce - Electronics + "digikey_products": (".digikey", "DigikeyProducts"), + "mouser_products": (".mouser", "MouserProducts"), + "microcenter_products": (".microcenter", "MicroCenterProducts"), + "autozone_products": (".autozone", "AutozoneProducts"), + "bh_products": (".bh", "BHProducts"), + 
"mediamarkt_products": (".mediamarkt", "MediamarktProducts"), + # E-commerce - Fashion + "zalando_products": (".zalando", "ZalandoProducts"), + "sephora_products": (".sephora", "SephoraProducts"), + "zara_products": (".zara", "ZaraProducts"), + "zara_home_products": (".zara", "ZaraHomeProducts"), + "mango_products": (".mango", "MangoProducts"), + "massimo_dutti_products": (".massimo_dutti", "MassimoDuttiProducts"), + "asos_products": (".asos", "AsosProducts"), + "hm_products": (".hm", "HMProducts"), + "american_eagle_products": (".american_eagle", "AmericanEagleProducts"), + "myntra_products": (".myntra", "MyntraProducts"), + # E-commerce - Luxury + "chanel_products": (".chanel", "ChanelProducts"), + "dior_products": (".dior", "DiorProducts"), + "fendi_products": (".fendi", "FendiProducts"), + "balenciaga_products": (".balenciaga", "BalenciagaProducts"), + "bottegaveneta_products": (".bottegaveneta", "BottegaVenetaProducts"), + "celine_products": (".celine", "CelineProducts"), + "loewe_products": (".loewe", "LoeweProducts"), + "berluti_products": (".berluti", "BerlutiProducts"), + "moynat_products": (".moynat", "MoynatProducts"), + "hermes_products": (".hermes", "HermesProducts"), + "delvaux_products": (".delvaux", "DelvauxProducts"), + "prada_products": (".prada", "PradaProducts"), + "montblanc_products": (".montblanc", "MontblancProducts"), + "ysl_products": (".ysl", "YSLProducts"), + # E-commerce - Home & Furniture + "ikea_products": (".ikea", "IkeaProducts"), + "ashley_furniture_products": (".ashley_furniture", "AshleyFurnitureProducts"), + "crateandbarrel_products": (".crateandbarrel", "CrateAndBarrelProducts"), + "lazboy_products": (".lazboy", "LaZBoyProducts"), + "mattressfirm_products": (".mattressfirm", "MattressfirmProducts"), + "sleepnumber_products": (".sleepnumber", "SleepNumberProducts"), + "raymourflanigan_products": (".raymourflanigan", "RaymourFlaniganProducts"), + "mybobs_products": (".mybobs", "MybobsProducts"), + # E-commerce - Other + 
"fanatics_products": (".fanatics", "FanaticsProducts"), + "carters_products": (".carters", "CartersProducts"), + "lego_products": (".lego", "LegoProducts"), + "llbean_products": (".llbean", "LLBeanProducts"), + "toysrus_products": (".toysrus", "ToysRUsProducts"), + "homedepot_us_products": (".homedepot", "HomeDepotUSProducts"), + "homedepot_ca_products": (".homedepot", "HomeDepotCAProducts"), + "lowes_products": (".lowes", "LowesProducts"), + "rona_products": (".rona", "RonaProducts"), + # E-commerce - International + "shopee_products": (".shopee", "ShopeeProducts"), + "lazada_products": (".lazada", "LazadaProducts"), + "lazada_reviews": (".lazada", "LazadaReviews"), + "lazada_products_search": (".lazada", "LazadaProductsSearch"), + "ozon_products": (".ozon", "OzonProducts"), + "wildberries_products": (".wildberries", "WildberriesProducts"), + "tokopedia_products": (".tokopedia", "TokopediaProducts"), + "mercadolivre_products": (".mercadolivre", "MercadolivreProducts"), + "naver_products": (".naver", "NaverProducts"), + "google_shopping_products": (".google_shopping", "GoogleShoppingProducts"), + "google_shopping_search_us": (".google_shopping", "GoogleShoppingSearchUS"), + # Real Estate + "australia_real_estate": (".real_estate", "AustraliaRealEstate"), + "zillow_properties": (".zillow", "ZillowProperties"), + "zillow_price_history": (".zillow", "ZillowPriceHistory"), + "zoopla_properties": (".zoopla", "ZooplaProperties"), + "otodom_poland": (".otodom", "OtodomPoland"), + "inmuebles24_mexico": (".inmuebles24", "Inmuebles24Mexico"), + "zonaprop_argentina": (".zonaprop", "ZonapropArgentina"), + "metrocuadrado_properties": (".metrocuadrado", "MetrocuadradoProperties"), + "infocasas_uruguay": (".infocasas", "InfocasasUruguay"), + "properati_properties": (".properati", "ProperatiProperties"), + "toctoc_properties": (".toctoc", "ToctocProperties"), + "realtor_international_properties": (".realtor", "RealtorInternationalProperties"), + # Travel + "airbnb_properties": 
(".airbnb", "AirbnbProperties"), + "booking_listings_search": (".booking", "BookingListingsSearch"), + "booking_hotel_listings": (".booking", "BookingHotelListings"), + "agoda_properties": (".agoda", "AgodaProperties"), + # Automotive + "webmotors_brasil": (".webmotors", "WebmotorsBrasil"), + "chileautos_chile": (".chileautos", "ChileautosChile"), + "carsales_listings": (".carsales", "CarsalesListings"), + # Classifieds + "olx_brazil": (".olx", "OLXBrazil"), + "yapo_chile": (".yapo", "YapoChile"), + # Reference Data + "imdb_movies": (".imdb", "IMDBMovies"), + "nba_players_stats": (".nba", "NBAPlayersStats"), + "goodreads_books": (".goodreads", "GoodreadsBooks"), + "world_population": (".world_population", "WorldPopulation"), + "world_zipcodes": (".world_zipcodes", "WorldZipcodes"), + # Finance + "yahoo_finance_businesses": (".yahoo_finance", "YahooFinanceBusinesses"), +} class DatasetsClient: @@ -190,189 +248,20 @@ class DatasetsClient: def __init__(self, engine: "AsyncEngine"): self._engine = engine - - # Lazy-loaded dataset instances - self._linkedin_profiles: Optional[LinkedInPeopleProfiles] = None - self._linkedin_companies: Optional[LinkedInCompanyProfiles] = None - self._linkedin_job_listings: Optional[LinkedInJobListings] = None - self._amazon_products: Optional[AmazonProducts] = None - self._amazon_reviews: Optional[AmazonReviews] = None - self._crunchbase_companies: Optional[CrunchbaseCompanies] = None - self._imdb_movies: Optional[IMDBMovies] = None - self._nba_players_stats: Optional[NBAPlayersStats] = None - self._goodreads_books: Optional[GoodreadsBooks] = None - self._world_population: Optional[WorldPopulation] = None - self._companies_enriched: Optional[CompaniesEnriched] = None - self._employees_enriched: Optional[EmployeesEnriched] = None - self._glassdoor_companies: Optional[GlassdoorCompanies] = None - self._glassdoor_reviews: Optional[GlassdoorReviews] = None - self._glassdoor_jobs: Optional[GlassdoorJobs] = None - self._google_maps_reviews: 
Optional[GoogleMapsReviews] = None - self._yelp_businesses: Optional[YelpBusinesses] = None - self._yelp_reviews: Optional[YelpReviews] = None - self._zoominfo_companies: Optional[ZoomInfoCompanies] = None - self._pitchbook_companies: Optional[PitchBookCompanies] = None - self._g2_products: Optional[G2Products] = None - self._g2_reviews: Optional[G2Reviews] = None - self._trustpilot_reviews: Optional[TrustpilotReviews] = None - self._indeed_companies: Optional[IndeedCompanies] = None - self._xing_profiles: Optional[XingProfiles] = None - self._slintel_companies: Optional[SlintelCompanies] = None - self._owler_companies: Optional[OwlerCompanies] = None - self._us_lawyers: Optional[USLawyers] = None - self._manta_businesses: Optional[MantaBusinesses] = None - self._ventureradar_companies: Optional[VentureRadarCompanies] = None - self._trustradius_reviews: Optional[TrustRadiusReviews] = None - self._instagram_profiles: Optional[InstagramProfiles] = None - self._tiktok_profiles: Optional[TikTokProfiles] = None - self._australia_real_estate: Optional[AustraliaRealEstate] = None - self._indeed_jobs: Optional[IndeedJobs] = None - self._walmart_products: Optional[WalmartProducts] = None - self._mediamarkt_products: Optional[MediamarktProducts] = None - self._fendi_products: Optional[FendiProducts] = None - self._zalando_products: Optional[ZalandoProducts] = None - self._sephora_products: Optional[SephoraProducts] = None - self._zara_products: Optional[ZaraProducts] = None - self._zara_home_products: Optional[ZaraHomeProducts] = None - self._mango_products: Optional[MangoProducts] = None - self._massimo_dutti_products: Optional[MassimoDuttiProducts] = None - self._otodom_poland: Optional[OtodomPoland] = None - self._webmotors_brasil: Optional[WebmotorsBrasil] = None - self._airbnb_properties: Optional[AirbnbProperties] = None - self._asos_products: Optional[AsosProducts] = None - self._chanel_products: Optional[ChanelProducts] = None - self._ashley_furniture_products: 
Optional[AshleyFurnitureProducts] = None - self._fanatics_products: Optional[FanaticsProducts] = None - self._carters_products: Optional[CartersProducts] = None - self._american_eagle_products: Optional[AmericanEagleProducts] = None - self._ikea_products: Optional[IkeaProducts] = None - self._hm_products: Optional[HMProducts] = None - self._lego_products: Optional[LegoProducts] = None - self._mattressfirm_products: Optional[MattressfirmProducts] = None - self._crateandbarrel_products: Optional[CrateAndBarrelProducts] = None - self._llbean_products: Optional[LLBeanProducts] = None - self._shein_products: Optional[SheinProducts] = None - self._toysrus_products: Optional[ToysRUsProducts] = None - self._mybobs_products: Optional[MybobsProducts] = None - self._sleepnumber_products: Optional[SleepNumberProducts] = None - self._raymourflanigan_products: Optional[RaymourFlaniganProducts] = None - self._inmuebles24_mexico: Optional[Inmuebles24Mexico] = None - self._mouser_products: Optional[MouserProducts] = None - self._zillow_properties: Optional[ZillowProperties] = None - self._zonaprop_argentina: Optional[ZonapropArgentina] = None - self._metrocuadrado_properties: Optional[MetrocuadradoProperties] = None - self._chileautos_chile: Optional[ChileautosChile] = None - self._infocasas_uruguay: Optional[InfocasasUruguay] = None - self._lazboy_products: Optional[LaZBoyProducts] = None - self._properati_properties: Optional[ProperatiProperties] = None - self._yapo_chile: Optional[YapoChile] = None - self._toctoc_properties: Optional[ToctocProperties] = None - self._dior_products: Optional[DiorProducts] = None - self._balenciaga_products: Optional[BalenciagaProducts] = None - self._bottegaveneta_products: Optional[BottegaVenetaProducts] = None - self._olx_brazil: Optional[OLXBrazil] = None - self._celine_products: Optional[CelineProducts] = None - self._loewe_products: Optional[LoeweProducts] = None - self._berluti_products: Optional[BerlutiProducts] = None - 
self._moynat_products: Optional[MoynatProducts] = None - self._hermes_products: Optional[HermesProducts] = None - self._delvaux_products: Optional[DelvauxProducts] = None - self._prada_products: Optional[PradaProducts] = None - self._montblanc_products: Optional[MontblancProducts] = None - self._ysl_products: Optional[YSLProducts] = None - self._amazon_sellers_info: Optional[AmazonSellersInfo] = None - self._world_zipcodes: Optional[WorldZipcodes] = None - self._pinterest_posts: Optional[PinterestPosts] = None - self._pinterest_profiles: Optional[PinterestProfiles] = None - self._shopee_products: Optional[ShopeeProducts] = None - self._lazada_products: Optional[LazadaProducts] = None - self._instagram_posts: Optional[InstagramPosts] = None - self._youtube_profiles: Optional[YouTubeProfiles] = None - self._youtube_videos: Optional[YouTubeVideos] = None - self._youtube_comments: Optional[YouTubeComments] = None - self._digikey_products: Optional[DigikeyProducts] = None - self._facebook_pages_posts: Optional[FacebookPagesPosts] = None - # New datasets - Social Media - self._facebook_comments: Optional[FacebookComments] = None - self._facebook_posts_by_url: Optional[FacebookPostsByUrl] = None - self._facebook_reels: Optional[FacebookReels] = None - self._facebook_marketplace: Optional[FacebookMarketplace] = None - self._facebook_company_reviews: Optional[FacebookCompanyReviews] = None - self._facebook_events: Optional[FacebookEvents] = None - self._facebook_profiles: Optional[FacebookProfiles] = None - self._facebook_pages_profiles: Optional[FacebookPagesProfiles] = None - self._facebook_group_posts: Optional[FacebookGroupPosts] = None - self._tiktok_comments: Optional[TikTokComments] = None - self._tiktok_posts: Optional[TikTokPosts] = None - self._tiktok_shop: Optional[TikTokShop] = None - self._instagram_comments: Optional[InstagramComments] = None - self._instagram_reels: Optional[InstagramReels] = None - self._linkedin_posts: Optional[LinkedInPosts] = None - 
self._linkedin_profiles_job_listings: Optional[LinkedInProfilesJobListings] = None - self._x_twitter_posts: Optional[XTwitterPosts] = None - self._x_twitter_profiles: Optional[XTwitterProfiles] = None - self._reddit_posts: Optional[RedditPosts] = None - self._reddit_comments: Optional[RedditComments] = None - self._bluesky_posts: Optional[BlueskyPosts] = None - self._bluesky_top_profiles: Optional[BlueskyTopProfiles] = None - self._snapchat_posts: Optional[SnapchatPosts] = None - self._quora_posts: Optional[QuoraPosts] = None - self._vimeo_videos: Optional[VimeoVideos] = None - # New datasets - News/Content - self._google_news: Optional[GoogleNews] = None - self._wikipedia_articles: Optional[WikipediaArticles] = None - self._bbc_news: Optional[BBCNews] = None - self._cnn_news: Optional[CNNNews] = None - self._github_repositories: Optional[GithubRepositories] = None - self._creative_commons_images: Optional[CreativeCommonsImages] = None - self._creative_commons_3d_models: Optional[CreativeCommons3DModels] = None - # New datasets - App Stores - self._google_play_store: Optional[GooglePlayStore] = None - self._google_play_reviews: Optional[GooglePlayReviews] = None - self._apple_app_store: Optional[AppleAppStore] = None - self._apple_app_store_reviews: Optional[AppleAppStoreReviews] = None - # New datasets - E-commerce - self._amazon_best_sellers: Optional[AmazonBestSellers] = None - self._amazon_products_search: Optional[AmazonProductsSearch] = None - self._amazon_products_global: Optional[AmazonProductsGlobal] = None - self._amazon_walmart: Optional[AmazonWalmart] = None - self._walmart_sellers_info: Optional[WalmartSellersInfo] = None - self._ebay_products: Optional[EbayProducts] = None - self._etsy_products: Optional[EtsyProducts] = None - self._target_products: Optional[TargetProducts] = None - self._wayfair_products: Optional[WayfairProducts] = None - self._bestbuy_products: Optional[BestBuyProducts] = None - self._myntra_products: Optional[MyntraProducts] = 
None - self._ozon_products: Optional[OzonProducts] = None - self._wildberries_products: Optional[WildberriesProducts] = None - self._tokopedia_products: Optional[TokopediaProducts] = None - self._google_shopping_products: Optional[GoogleShoppingProducts] = None - self._google_shopping_search_us: Optional[GoogleShoppingSearchUS] = None - self._mercadolivre_products: Optional[MercadolivreProducts] = None - self._naver_products: Optional[NaverProducts] = None - self._lazada_reviews: Optional[LazadaReviews] = None - self._lazada_products_search: Optional[LazadaProductsSearch] = None - self._homedepot_us_products: Optional[HomeDepotUSProducts] = None - self._homedepot_ca_products: Optional[HomeDepotCAProducts] = None - self._lowes_products: Optional[LowesProducts] = None - self._rona_products: Optional[RonaProducts] = None - self._kroger_products: Optional[KrogerProducts] = None - self._macys_products: Optional[MacysProducts] = None - self._costco_products: Optional[CostcoProducts] = None - self._bh_products: Optional[BHProducts] = None - self._microcenter_products: Optional[MicroCenterProducts] = None - self._autozone_products: Optional[AutozoneProducts] = None - # New datasets - Real Estate/Travel - self._zillow_price_history: Optional[ZillowPriceHistory] = None - self._zoopla_properties: Optional[ZooplaProperties] = None - self._booking_listings_search: Optional[BookingListingsSearch] = None - self._booking_hotel_listings: Optional[BookingHotelListings] = None - self._realtor_international_properties: Optional[RealtorInternationalProperties] = None - self._agoda_properties: Optional[AgodaProperties] = None - self._carsales_listings: Optional[CarsalesListings] = None - # New datasets - Finance/Maps - self._yahoo_finance_businesses: Optional[YahooFinanceBusinesses] = None - self._google_maps_full_info: Optional[GoogleMapsFullInfo] = None + self._cache: dict = {} + + def __getattr__(self, name: str): + if name in _DATASET_REGISTRY: + if name not in self._cache: + 
module_path, class_name = _DATASET_REGISTRY[name] + module = importlib.import_module(module_path, package=__package__) + cls = getattr(module, class_name) + self._cache[name] = cls(self._engine) + return self._cache[name] + raise AttributeError( + f"'{type(self).__name__}' has no dataset '{name}'. " + f"Available datasets: {', '.join(sorted(_DATASET_REGISTRY))}" + ) async def list(self) -> List[DatasetInfo]: """ @@ -394,1242 +283,3 @@ async def list(self) -> List[DatasetInfo]: ) ) return datasets - - # Dataset properties for IDE autocomplete - - @property - def linkedin_profiles(self) -> LinkedInPeopleProfiles: - """LinkedIn People Profiles dataset (620M+ records).""" - if self._linkedin_profiles is None: - self._linkedin_profiles = LinkedInPeopleProfiles(self._engine) - return self._linkedin_profiles - - @property - def linkedin_companies(self) -> LinkedInCompanyProfiles: - """LinkedIn Company Profiles dataset.""" - if self._linkedin_companies is None: - self._linkedin_companies = LinkedInCompanyProfiles(self._engine) - return self._linkedin_companies - - @property - def linkedin_job_listings(self) -> LinkedInJobListings: - """LinkedIn Profiles Jobs Listings dataset.""" - if self._linkedin_job_listings is None: - self._linkedin_job_listings = LinkedInJobListings(self._engine) - return self._linkedin_job_listings - - @property - def amazon_products(self) -> AmazonProducts: - """Amazon Products dataset.""" - if self._amazon_products is None: - self._amazon_products = AmazonProducts(self._engine) - return self._amazon_products - - @property - def amazon_reviews(self) -> AmazonReviews: - """Amazon Reviews dataset.""" - if self._amazon_reviews is None: - self._amazon_reviews = AmazonReviews(self._engine) - return self._amazon_reviews - - @property - def crunchbase_companies(self) -> CrunchbaseCompanies: - """Crunchbase Companies dataset (2.3M+ records).""" - if self._crunchbase_companies is None: - self._crunchbase_companies = CrunchbaseCompanies(self._engine) - return 
self._crunchbase_companies - - @property - def imdb_movies(self) -> IMDBMovies: - """IMDB Movies dataset (867K+ records).""" - if self._imdb_movies is None: - self._imdb_movies = IMDBMovies(self._engine) - return self._imdb_movies - - @property - def nba_players_stats(self) -> NBAPlayersStats: - """NBA Players Stats dataset (17K+ records).""" - if self._nba_players_stats is None: - self._nba_players_stats = NBAPlayersStats(self._engine) - return self._nba_players_stats - - @property - def goodreads_books(self) -> GoodreadsBooks: - """Goodreads Books dataset.""" - if self._goodreads_books is None: - self._goodreads_books = GoodreadsBooks(self._engine) - return self._goodreads_books - - @property - def world_population(self) -> WorldPopulation: - """World Population dataset.""" - if self._world_population is None: - self._world_population = WorldPopulation(self._engine) - return self._world_population - - @property - def companies_enriched(self) -> CompaniesEnriched: - """Companies Enriched dataset - multi-source company information.""" - if self._companies_enriched is None: - self._companies_enriched = CompaniesEnriched(self._engine) - return self._companies_enriched - - @property - def employees_enriched(self) -> EmployeesEnriched: - """Employees Business Enriched dataset - LinkedIn profiles with company data.""" - if self._employees_enriched is None: - self._employees_enriched = EmployeesEnriched(self._engine) - return self._employees_enriched - - @property - def glassdoor_companies(self) -> GlassdoorCompanies: - """Glassdoor Companies Overview dataset - ratings, reviews, and company details.""" - if self._glassdoor_companies is None: - self._glassdoor_companies = GlassdoorCompanies(self._engine) - return self._glassdoor_companies - - @property - def glassdoor_reviews(self) -> GlassdoorReviews: - """Glassdoor Companies Reviews dataset - employee reviews and ratings.""" - if self._glassdoor_reviews is None: - self._glassdoor_reviews = GlassdoorReviews(self._engine) 
- return self._glassdoor_reviews - - @property - def glassdoor_jobs(self) -> GlassdoorJobs: - """Glassdoor Job Listings dataset - job postings with company data.""" - if self._glassdoor_jobs is None: - self._glassdoor_jobs = GlassdoorJobs(self._engine) - return self._glassdoor_jobs - - @property - def google_maps_reviews(self) -> GoogleMapsReviews: - """Google Maps Reviews dataset - place reviews and ratings.""" - if self._google_maps_reviews is None: - self._google_maps_reviews = GoogleMapsReviews(self._engine) - return self._google_maps_reviews - - @property - def yelp_businesses(self) -> YelpBusinesses: - """Yelp Businesses Overview dataset - business listings and ratings.""" - if self._yelp_businesses is None: - self._yelp_businesses = YelpBusinesses(self._engine) - return self._yelp_businesses - - @property - def yelp_reviews(self) -> YelpReviews: - """Yelp Business Reviews dataset - individual business reviews.""" - if self._yelp_reviews is None: - self._yelp_reviews = YelpReviews(self._engine) - return self._yelp_reviews - - @property - def zoominfo_companies(self) -> ZoomInfoCompanies: - """ZoomInfo Companies dataset - company data with financials and contacts.""" - if self._zoominfo_companies is None: - self._zoominfo_companies = ZoomInfoCompanies(self._engine) - return self._zoominfo_companies - - @property - def pitchbook_companies(self) -> PitchBookCompanies: - """PitchBook Companies dataset - PE/VC company data with deals.""" - if self._pitchbook_companies is None: - self._pitchbook_companies = PitchBookCompanies(self._engine) - return self._pitchbook_companies - - @property - def g2_products(self) -> G2Products: - """G2 Software Product Overview dataset - software ratings and reviews.""" - if self._g2_products is None: - self._g2_products = G2Products(self._engine) - return self._g2_products - - @property - def g2_reviews(self) -> G2Reviews: - """G2 Software Product Reviews dataset - individual product reviews.""" - if self._g2_reviews is None: - 
self._g2_reviews = G2Reviews(self._engine) - return self._g2_reviews - - @property - def trustpilot_reviews(self) -> TrustpilotReviews: - """Trustpilot Business Reviews dataset - company reviews and ratings.""" - if self._trustpilot_reviews is None: - self._trustpilot_reviews = TrustpilotReviews(self._engine) - return self._trustpilot_reviews - - @property - def indeed_companies(self) -> IndeedCompanies: - """Indeed Companies Info dataset - company profiles with jobs and reviews.""" - if self._indeed_companies is None: - self._indeed_companies = IndeedCompanies(self._engine) - return self._indeed_companies - - @property - def xing_profiles(self) -> XingProfiles: - """Xing Social Network Profiles dataset - professional profiles.""" - if self._xing_profiles is None: - self._xing_profiles = XingProfiles(self._engine) - return self._xing_profiles - - @property - def slintel_companies(self) -> SlintelCompanies: - """Slintel 6sense Company Information dataset - technographics and company data.""" - if self._slintel_companies is None: - self._slintel_companies = SlintelCompanies(self._engine) - return self._slintel_companies - - @property - def owler_companies(self) -> OwlerCompanies: - """Owler Companies Information dataset - competitive intelligence and metrics.""" - if self._owler_companies is None: - self._owler_companies = OwlerCompanies(self._engine) - return self._owler_companies - - @property - def us_lawyers(self) -> USLawyers: - """US Lawyers Directory dataset - lawyer profiles and practice areas.""" - if self._us_lawyers is None: - self._us_lawyers = USLawyers(self._engine) - return self._us_lawyers - - @property - def manta_businesses(self) -> MantaBusinesses: - """Manta Businesses dataset - business listings with revenue and employees.""" - if self._manta_businesses is None: - self._manta_businesses = MantaBusinesses(self._engine) - return self._manta_businesses - - @property - def ventureradar_companies(self) -> VentureRadarCompanies: - """VentureRadar 
Company Information dataset - startup intelligence.""" - if self._ventureradar_companies is None: - self._ventureradar_companies = VentureRadarCompanies(self._engine) - return self._ventureradar_companies - - @property - def trustradius_reviews(self) -> TrustRadiusReviews: - """TrustRadius Product Reviews dataset - software product reviews.""" - if self._trustradius_reviews is None: - self._trustradius_reviews = TrustRadiusReviews(self._engine) - return self._trustradius_reviews - - @property - def instagram_profiles(self) -> InstagramProfiles: - """Instagram Profiles dataset - user profiles and engagement.""" - if self._instagram_profiles is None: - self._instagram_profiles = InstagramProfiles(self._engine) - return self._instagram_profiles - - @property - def tiktok_profiles(self) -> TikTokProfiles: - """TikTok Profiles dataset - user profiles and engagement.""" - if self._tiktok_profiles is None: - self._tiktok_profiles = TikTokProfiles(self._engine) - return self._tiktok_profiles - - @property - def australia_real_estate(self) -> AustraliaRealEstate: - """Australia Real Estate Properties dataset.""" - if self._australia_real_estate is None: - self._australia_real_estate = AustraliaRealEstate(self._engine) - return self._australia_real_estate - - @property - def indeed_jobs(self) -> IndeedJobs: - """Indeed Job Listings dataset.""" - if self._indeed_jobs is None: - self._indeed_jobs = IndeedJobs(self._engine) - return self._indeed_jobs - - @property - def walmart_products(self) -> WalmartProducts: - """Walmart Products dataset.""" - if self._walmart_products is None: - self._walmart_products = WalmartProducts(self._engine) - return self._walmart_products - - @property - def mediamarkt_products(self) -> MediamarktProducts: - """Mediamarkt.de Products dataset.""" - if self._mediamarkt_products is None: - self._mediamarkt_products = MediamarktProducts(self._engine) - return self._mediamarkt_products - - @property - def fendi_products(self) -> FendiProducts: - 
"""Fendi Products dataset.""" - if self._fendi_products is None: - self._fendi_products = FendiProducts(self._engine) - return self._fendi_products - - @property - def zalando_products(self) -> ZalandoProducts: - """Zalando Products dataset.""" - if self._zalando_products is None: - self._zalando_products = ZalandoProducts(self._engine) - return self._zalando_products - - @property - def sephora_products(self) -> SephoraProducts: - """Sephora Products dataset.""" - if self._sephora_products is None: - self._sephora_products = SephoraProducts(self._engine) - return self._sephora_products - - @property - def zara_products(self) -> ZaraProducts: - """Zara Products dataset.""" - if self._zara_products is None: - self._zara_products = ZaraProducts(self._engine) - return self._zara_products - - @property - def zara_home_products(self) -> ZaraHomeProducts: - """Zara Home Products dataset.""" - if self._zara_home_products is None: - self._zara_home_products = ZaraHomeProducts(self._engine) - return self._zara_home_products - - @property - def mango_products(self) -> MangoProducts: - """Mango Products dataset.""" - if self._mango_products is None: - self._mango_products = MangoProducts(self._engine) - return self._mango_products - - @property - def massimo_dutti_products(self) -> MassimoDuttiProducts: - """Massimo Dutti Products dataset.""" - if self._massimo_dutti_products is None: - self._massimo_dutti_products = MassimoDuttiProducts(self._engine) - return self._massimo_dutti_products - - @property - def otodom_poland(self) -> OtodomPoland: - """Otodom Poland real estate dataset.""" - if self._otodom_poland is None: - self._otodom_poland = OtodomPoland(self._engine) - return self._otodom_poland - - @property - def webmotors_brasil(self) -> WebmotorsBrasil: - """Webmotors Brasil vehicle listings dataset.""" - if self._webmotors_brasil is None: - self._webmotors_brasil = WebmotorsBrasil(self._engine) - return self._webmotors_brasil - - @property - def 
airbnb_properties(self) -> AirbnbProperties: - """Airbnb Properties dataset.""" - if self._airbnb_properties is None: - self._airbnb_properties = AirbnbProperties(self._engine) - return self._airbnb_properties - - @property - def asos_products(self) -> AsosProducts: - """Asos Products dataset.""" - if self._asos_products is None: - self._asos_products = AsosProducts(self._engine) - return self._asos_products - - @property - def chanel_products(self) -> ChanelProducts: - """Chanel Products dataset.""" - if self._chanel_products is None: - self._chanel_products = ChanelProducts(self._engine) - return self._chanel_products - - @property - def ashley_furniture_products(self) -> AshleyFurnitureProducts: - """Ashley Furniture Products dataset.""" - if self._ashley_furniture_products is None: - self._ashley_furniture_products = AshleyFurnitureProducts(self._engine) - return self._ashley_furniture_products - - @property - def fanatics_products(self) -> FanaticsProducts: - """Fanatics Products dataset.""" - if self._fanatics_products is None: - self._fanatics_products = FanaticsProducts(self._engine) - return self._fanatics_products - - @property - def carters_products(self) -> CartersProducts: - """Carters Products dataset.""" - if self._carters_products is None: - self._carters_products = CartersProducts(self._engine) - return self._carters_products - - @property - def american_eagle_products(self) -> AmericanEagleProducts: - """American Eagle Products dataset.""" - if self._american_eagle_products is None: - self._american_eagle_products = AmericanEagleProducts(self._engine) - return self._american_eagle_products - - @property - def ikea_products(self) -> IkeaProducts: - """Ikea Products dataset.""" - if self._ikea_products is None: - self._ikea_products = IkeaProducts(self._engine) - return self._ikea_products - - @property - def hm_products(self) -> HMProducts: - """H&M Products dataset.""" - if self._hm_products is None: - self._hm_products = HMProducts(self._engine) 
- return self._hm_products - - @property - def lego_products(self) -> LegoProducts: - """Lego Products dataset.""" - if self._lego_products is None: - self._lego_products = LegoProducts(self._engine) - return self._lego_products - - @property - def mattressfirm_products(self) -> MattressfirmProducts: - """Mattressfirm Products dataset.""" - if self._mattressfirm_products is None: - self._mattressfirm_products = MattressfirmProducts(self._engine) - return self._mattressfirm_products - - @property - def crateandbarrel_products(self) -> CrateAndBarrelProducts: - """Crate and Barrel Products dataset.""" - if self._crateandbarrel_products is None: - self._crateandbarrel_products = CrateAndBarrelProducts(self._engine) - return self._crateandbarrel_products - - @property - def llbean_products(self) -> LLBeanProducts: - """L.L. Bean Products dataset.""" - if self._llbean_products is None: - self._llbean_products = LLBeanProducts(self._engine) - return self._llbean_products - - @property - def shein_products(self) -> SheinProducts: - """Shein Products dataset.""" - if self._shein_products is None: - self._shein_products = SheinProducts(self._engine) - return self._shein_products - - @property - def toysrus_products(self) -> ToysRUsProducts: - """Toys R Us Products dataset.""" - if self._toysrus_products is None: - self._toysrus_products = ToysRUsProducts(self._engine) - return self._toysrus_products - - @property - def mybobs_products(self) -> MybobsProducts: - """Mybobs Products dataset.""" - if self._mybobs_products is None: - self._mybobs_products = MybobsProducts(self._engine) - return self._mybobs_products - - @property - def sleepnumber_products(self) -> SleepNumberProducts: - """Sleep Number Products dataset.""" - if self._sleepnumber_products is None: - self._sleepnumber_products = SleepNumberProducts(self._engine) - return self._sleepnumber_products - - @property - def raymourflanigan_products(self) -> RaymourFlaniganProducts: - """Raymour and Flanigan Products 
dataset.""" - if self._raymourflanigan_products is None: - self._raymourflanigan_products = RaymourFlaniganProducts(self._engine) - return self._raymourflanigan_products - - @property - def inmuebles24_mexico(self) -> Inmuebles24Mexico: - """Inmuebles24 Mexico real estate dataset.""" - if self._inmuebles24_mexico is None: - self._inmuebles24_mexico = Inmuebles24Mexico(self._engine) - return self._inmuebles24_mexico - - @property - def mouser_products(self) -> MouserProducts: - """Mouser Products dataset.""" - if self._mouser_products is None: - self._mouser_products = MouserProducts(self._engine) - return self._mouser_products - - @property - def zillow_properties(self) -> ZillowProperties: - """Zillow Properties dataset.""" - if self._zillow_properties is None: - self._zillow_properties = ZillowProperties(self._engine) - return self._zillow_properties - - @property - def zonaprop_argentina(self) -> ZonapropArgentina: - """Zonaprop Argentina real estate dataset.""" - if self._zonaprop_argentina is None: - self._zonaprop_argentina = ZonapropArgentina(self._engine) - return self._zonaprop_argentina - - @property - def metrocuadrado_properties(self) -> MetrocuadradoProperties: - """Metrocuadrado Properties dataset.""" - if self._metrocuadrado_properties is None: - self._metrocuadrado_properties = MetrocuadradoProperties(self._engine) - return self._metrocuadrado_properties - - @property - def chileautos_chile(self) -> ChileautosChile: - """Chileautos Chile car listings dataset.""" - if self._chileautos_chile is None: - self._chileautos_chile = ChileautosChile(self._engine) - return self._chileautos_chile - - @property - def infocasas_uruguay(self) -> InfocasasUruguay: - """Infocasas Uruguay real estate dataset.""" - if self._infocasas_uruguay is None: - self._infocasas_uruguay = InfocasasUruguay(self._engine) - return self._infocasas_uruguay - - @property - def lazboy_products(self) -> LaZBoyProducts: - """La-Z-Boy Products dataset.""" - if self._lazboy_products is 
None: - self._lazboy_products = LaZBoyProducts(self._engine) - return self._lazboy_products - - @property - def properati_properties(self) -> ProperatiProperties: - """Properati Properties dataset.""" - if self._properati_properties is None: - self._properati_properties = ProperatiProperties(self._engine) - return self._properati_properties - - @property - def yapo_chile(self) -> YapoChile: - """Yapo Chile marketplace ads dataset.""" - if self._yapo_chile is None: - self._yapo_chile = YapoChile(self._engine) - return self._yapo_chile - - @property - def toctoc_properties(self) -> ToctocProperties: - """Toctoc Properties dataset.""" - if self._toctoc_properties is None: - self._toctoc_properties = ToctocProperties(self._engine) - return self._toctoc_properties - - @property - def dior_products(self) -> DiorProducts: - """Dior Products dataset.""" - if self._dior_products is None: - self._dior_products = DiorProducts(self._engine) - return self._dior_products - - @property - def balenciaga_products(self) -> BalenciagaProducts: - """Balenciaga Products dataset.""" - if self._balenciaga_products is None: - self._balenciaga_products = BalenciagaProducts(self._engine) - return self._balenciaga_products - - @property - def bottegaveneta_products(self) -> BottegaVenetaProducts: - """Bottega Veneta Products dataset.""" - if self._bottegaveneta_products is None: - self._bottegaveneta_products = BottegaVenetaProducts(self._engine) - return self._bottegaveneta_products - - @property - def olx_brazil(self) -> OLXBrazil: - """OLX Brazil marketplace ads dataset.""" - if self._olx_brazil is None: - self._olx_brazil = OLXBrazil(self._engine) - return self._olx_brazil - - @property - def celine_products(self) -> CelineProducts: - """Celine Products dataset.""" - if self._celine_products is None: - self._celine_products = CelineProducts(self._engine) - return self._celine_products - - @property - def loewe_products(self) -> LoeweProducts: - """Loewe Products dataset.""" - if 
self._loewe_products is None: - self._loewe_products = LoeweProducts(self._engine) - return self._loewe_products - - @property - def berluti_products(self) -> BerlutiProducts: - """Berluti Products dataset.""" - if self._berluti_products is None: - self._berluti_products = BerlutiProducts(self._engine) - return self._berluti_products - - @property - def moynat_products(self) -> MoynatProducts: - """Moynat Products dataset.""" - if self._moynat_products is None: - self._moynat_products = MoynatProducts(self._engine) - return self._moynat_products - - @property - def hermes_products(self) -> HermesProducts: - """Hermes Products dataset.""" - if self._hermes_products is None: - self._hermes_products = HermesProducts(self._engine) - return self._hermes_products - - @property - def delvaux_products(self) -> DelvauxProducts: - """Delvaux Products dataset.""" - if self._delvaux_products is None: - self._delvaux_products = DelvauxProducts(self._engine) - return self._delvaux_products - - @property - def prada_products(self) -> PradaProducts: - """Prada Products dataset.""" - if self._prada_products is None: - self._prada_products = PradaProducts(self._engine) - return self._prada_products - - @property - def montblanc_products(self) -> MontblancProducts: - """Montblanc Products dataset.""" - if self._montblanc_products is None: - self._montblanc_products = MontblancProducts(self._engine) - return self._montblanc_products - - @property - def ysl_products(self) -> YSLProducts: - """YSL Products dataset.""" - if self._ysl_products is None: - self._ysl_products = YSLProducts(self._engine) - return self._ysl_products - - @property - def amazon_sellers_info(self) -> AmazonSellersInfo: - """Amazon Sellers Info dataset.""" - if self._amazon_sellers_info is None: - self._amazon_sellers_info = AmazonSellersInfo(self._engine) - return self._amazon_sellers_info - - @property - def world_zipcodes(self) -> WorldZipcodes: - """World Zipcodes dataset.""" - if self._world_zipcodes is None: 
- self._world_zipcodes = WorldZipcodes(self._engine) - return self._world_zipcodes - - @property - def pinterest_posts(self) -> PinterestPosts: - """Pinterest Posts dataset.""" - if self._pinterest_posts is None: - self._pinterest_posts = PinterestPosts(self._engine) - return self._pinterest_posts - - @property - def pinterest_profiles(self) -> PinterestProfiles: - """Pinterest Profiles dataset.""" - if self._pinterest_profiles is None: - self._pinterest_profiles = PinterestProfiles(self._engine) - return self._pinterest_profiles - - @property - def shopee_products(self) -> ShopeeProducts: - """Shopee Products dataset.""" - if self._shopee_products is None: - self._shopee_products = ShopeeProducts(self._engine) - return self._shopee_products - - @property - def lazada_products(self) -> LazadaProducts: - """Lazada Products dataset.""" - if self._lazada_products is None: - self._lazada_products = LazadaProducts(self._engine) - return self._lazada_products - - @property - def instagram_posts(self) -> InstagramPosts: - """Instagram Posts dataset.""" - if self._instagram_posts is None: - self._instagram_posts = InstagramPosts(self._engine) - return self._instagram_posts - - @property - def youtube_profiles(self) -> YouTubeProfiles: - """YouTube Profiles dataset.""" - if self._youtube_profiles is None: - self._youtube_profiles = YouTubeProfiles(self._engine) - return self._youtube_profiles - - @property - def youtube_videos(self) -> YouTubeVideos: - """YouTube Videos dataset.""" - if self._youtube_videos is None: - self._youtube_videos = YouTubeVideos(self._engine) - return self._youtube_videos - - @property - def youtube_comments(self) -> YouTubeComments: - """YouTube Comments dataset.""" - if self._youtube_comments is None: - self._youtube_comments = YouTubeComments(self._engine) - return self._youtube_comments - - @property - def digikey_products(self) -> DigikeyProducts: - """Digikey Products dataset.""" - if self._digikey_products is None: - self._digikey_products = 
DigikeyProducts(self._engine) - return self._digikey_products - - @property - def facebook_pages_posts(self) -> FacebookPagesPosts: - """Facebook Pages Posts dataset.""" - if self._facebook_pages_posts is None: - self._facebook_pages_posts = FacebookPagesPosts(self._engine) - return self._facebook_pages_posts - - # --- New dataset properties - Social Media --- - - @property - def facebook_comments(self) -> FacebookComments: - """Facebook Comments dataset.""" - if self._facebook_comments is None: - self._facebook_comments = FacebookComments(self._engine) - return self._facebook_comments - - @property - def facebook_posts_by_url(self) -> FacebookPostsByUrl: - """Facebook Posts by URL dataset.""" - if self._facebook_posts_by_url is None: - self._facebook_posts_by_url = FacebookPostsByUrl(self._engine) - return self._facebook_posts_by_url - - @property - def facebook_reels(self) -> FacebookReels: - """Facebook Reels dataset.""" - if self._facebook_reels is None: - self._facebook_reels = FacebookReels(self._engine) - return self._facebook_reels - - @property - def facebook_marketplace(self) -> FacebookMarketplace: - """Facebook Marketplace dataset.""" - if self._facebook_marketplace is None: - self._facebook_marketplace = FacebookMarketplace(self._engine) - return self._facebook_marketplace - - @property - def facebook_company_reviews(self) -> FacebookCompanyReviews: - """Facebook Company Reviews dataset.""" - if self._facebook_company_reviews is None: - self._facebook_company_reviews = FacebookCompanyReviews(self._engine) - return self._facebook_company_reviews - - @property - def facebook_events(self) -> FacebookEvents: - """Facebook Events dataset.""" - if self._facebook_events is None: - self._facebook_events = FacebookEvents(self._engine) - return self._facebook_events - - @property - def facebook_profiles(self) -> FacebookProfiles: - """Facebook Profiles dataset.""" - if self._facebook_profiles is None: - self._facebook_profiles = FacebookProfiles(self._engine) - 
return self._facebook_profiles - - @property - def facebook_pages_profiles(self) -> FacebookPagesProfiles: - """Facebook Pages and Profiles dataset.""" - if self._facebook_pages_profiles is None: - self._facebook_pages_profiles = FacebookPagesProfiles(self._engine) - return self._facebook_pages_profiles - - @property - def facebook_group_posts(self) -> FacebookGroupPosts: - """Facebook Group Posts dataset.""" - if self._facebook_group_posts is None: - self._facebook_group_posts = FacebookGroupPosts(self._engine) - return self._facebook_group_posts - - @property - def tiktok_comments(self) -> TikTokComments: - """TikTok Comments dataset.""" - if self._tiktok_comments is None: - self._tiktok_comments = TikTokComments(self._engine) - return self._tiktok_comments - - @property - def tiktok_posts(self) -> TikTokPosts: - """TikTok Posts dataset.""" - if self._tiktok_posts is None: - self._tiktok_posts = TikTokPosts(self._engine) - return self._tiktok_posts - - @property - def tiktok_shop(self) -> TikTokShop: - """TikTok Shop dataset.""" - if self._tiktok_shop is None: - self._tiktok_shop = TikTokShop(self._engine) - return self._tiktok_shop - - @property - def instagram_comments(self) -> InstagramComments: - """Instagram Comments dataset.""" - if self._instagram_comments is None: - self._instagram_comments = InstagramComments(self._engine) - return self._instagram_comments - - @property - def instagram_reels(self) -> InstagramReels: - """Instagram Reels dataset.""" - if self._instagram_reels is None: - self._instagram_reels = InstagramReels(self._engine) - return self._instagram_reels - - @property - def linkedin_posts(self) -> LinkedInPosts: - """LinkedIn Posts dataset.""" - if self._linkedin_posts is None: - self._linkedin_posts = LinkedInPosts(self._engine) - return self._linkedin_posts - - @property - def linkedin_profiles_job_listings(self) -> LinkedInProfilesJobListings: - """LinkedIn Profiles Job Listings dataset.""" - if self._linkedin_profiles_job_listings is 
None: - self._linkedin_profiles_job_listings = LinkedInProfilesJobListings(self._engine) - return self._linkedin_profiles_job_listings - - @property - def x_twitter_posts(self) -> XTwitterPosts: - """X (Twitter) Posts dataset.""" - if self._x_twitter_posts is None: - self._x_twitter_posts = XTwitterPosts(self._engine) - return self._x_twitter_posts - - @property - def x_twitter_profiles(self) -> XTwitterProfiles: - """X (Twitter) Profiles dataset.""" - if self._x_twitter_profiles is None: - self._x_twitter_profiles = XTwitterProfiles(self._engine) - return self._x_twitter_profiles - - @property - def reddit_posts(self) -> RedditPosts: - """Reddit Posts dataset.""" - if self._reddit_posts is None: - self._reddit_posts = RedditPosts(self._engine) - return self._reddit_posts - - @property - def reddit_comments(self) -> RedditComments: - """Reddit Comments dataset.""" - if self._reddit_comments is None: - self._reddit_comments = RedditComments(self._engine) - return self._reddit_comments - - @property - def bluesky_posts(self) -> BlueskyPosts: - """Bluesky Posts dataset.""" - if self._bluesky_posts is None: - self._bluesky_posts = BlueskyPosts(self._engine) - return self._bluesky_posts - - @property - def bluesky_top_profiles(self) -> BlueskyTopProfiles: - """Top 500 Bluesky Profiles dataset.""" - if self._bluesky_top_profiles is None: - self._bluesky_top_profiles = BlueskyTopProfiles(self._engine) - return self._bluesky_top_profiles - - @property - def snapchat_posts(self) -> SnapchatPosts: - """Snapchat Posts dataset.""" - if self._snapchat_posts is None: - self._snapchat_posts = SnapchatPosts(self._engine) - return self._snapchat_posts - - @property - def quora_posts(self) -> QuoraPosts: - """Quora Posts dataset.""" - if self._quora_posts is None: - self._quora_posts = QuoraPosts(self._engine) - return self._quora_posts - - @property - def vimeo_videos(self) -> VimeoVideos: - """Vimeo Videos dataset.""" - if self._vimeo_videos is None: - self._vimeo_videos = 
VimeoVideos(self._engine) - return self._vimeo_videos - - # --- New dataset properties - News/Content --- - - @property - def google_news(self) -> GoogleNews: - """Google News dataset.""" - if self._google_news is None: - self._google_news = GoogleNews(self._engine) - return self._google_news - - @property - def wikipedia_articles(self) -> WikipediaArticles: - """Wikipedia Articles dataset.""" - if self._wikipedia_articles is None: - self._wikipedia_articles = WikipediaArticles(self._engine) - return self._wikipedia_articles - - @property - def bbc_news(self) -> BBCNews: - """BBC News dataset.""" - if self._bbc_news is None: - self._bbc_news = BBCNews(self._engine) - return self._bbc_news - - @property - def cnn_news(self) -> CNNNews: - """CNN News dataset.""" - if self._cnn_news is None: - self._cnn_news = CNNNews(self._engine) - return self._cnn_news - - @property - def github_repositories(self) -> GithubRepositories: - """GitHub Repositories dataset.""" - if self._github_repositories is None: - self._github_repositories = GithubRepositories(self._engine) - return self._github_repositories - - @property - def creative_commons_images(self) -> CreativeCommonsImages: - """Creative Commons Images dataset.""" - if self._creative_commons_images is None: - self._creative_commons_images = CreativeCommonsImages(self._engine) - return self._creative_commons_images - - @property - def creative_commons_3d_models(self) -> CreativeCommons3DModels: - """Creative Commons 3D Models dataset.""" - if self._creative_commons_3d_models is None: - self._creative_commons_3d_models = CreativeCommons3DModels(self._engine) - return self._creative_commons_3d_models - - # --- New dataset properties - App Stores --- - - @property - def google_play_store(self) -> GooglePlayStore: - """Google Play Store dataset.""" - if self._google_play_store is None: - self._google_play_store = GooglePlayStore(self._engine) - return self._google_play_store - - @property - def google_play_reviews(self) -> 
GooglePlayReviews: - """Google Play Store Reviews dataset.""" - if self._google_play_reviews is None: - self._google_play_reviews = GooglePlayReviews(self._engine) - return self._google_play_reviews - - @property - def apple_app_store(self) -> AppleAppStore: - """Apple App Store dataset.""" - if self._apple_app_store is None: - self._apple_app_store = AppleAppStore(self._engine) - return self._apple_app_store - - @property - def apple_app_store_reviews(self) -> AppleAppStoreReviews: - """Apple App Store Reviews dataset.""" - if self._apple_app_store_reviews is None: - self._apple_app_store_reviews = AppleAppStoreReviews(self._engine) - return self._apple_app_store_reviews - - # --- New dataset properties - E-commerce --- - - @property - def amazon_best_sellers(self) -> AmazonBestSellers: - """Amazon Best Sellers dataset.""" - if self._amazon_best_sellers is None: - self._amazon_best_sellers = AmazonBestSellers(self._engine) - return self._amazon_best_sellers - - @property - def amazon_products_search(self) -> AmazonProductsSearch: - """Amazon Products Search dataset.""" - if self._amazon_products_search is None: - self._amazon_products_search = AmazonProductsSearch(self._engine) - return self._amazon_products_search - - @property - def amazon_products_global(self) -> AmazonProductsGlobal: - """Amazon Products Global dataset.""" - if self._amazon_products_global is None: - self._amazon_products_global = AmazonProductsGlobal(self._engine) - return self._amazon_products_global - - @property - def amazon_walmart(self) -> AmazonWalmart: - """Amazon Walmart dataset.""" - if self._amazon_walmart is None: - self._amazon_walmart = AmazonWalmart(self._engine) - return self._amazon_walmart - - @property - def walmart_sellers_info(self) -> WalmartSellersInfo: - """Walmart Sellers Info dataset.""" - if self._walmart_sellers_info is None: - self._walmart_sellers_info = WalmartSellersInfo(self._engine) - return self._walmart_sellers_info - - @property - def ebay_products(self) -> 
EbayProducts: - """eBay Products dataset.""" - if self._ebay_products is None: - self._ebay_products = EbayProducts(self._engine) - return self._ebay_products - - @property - def etsy_products(self) -> EtsyProducts: - """Etsy Products dataset.""" - if self._etsy_products is None: - self._etsy_products = EtsyProducts(self._engine) - return self._etsy_products - - @property - def target_products(self) -> TargetProducts: - """Target Products dataset.""" - if self._target_products is None: - self._target_products = TargetProducts(self._engine) - return self._target_products - - @property - def wayfair_products(self) -> WayfairProducts: - """Wayfair Products dataset.""" - if self._wayfair_products is None: - self._wayfair_products = WayfairProducts(self._engine) - return self._wayfair_products - - @property - def bestbuy_products(self) -> BestBuyProducts: - """Best Buy Products dataset.""" - if self._bestbuy_products is None: - self._bestbuy_products = BestBuyProducts(self._engine) - return self._bestbuy_products - - @property - def myntra_products(self) -> MyntraProducts: - """Myntra Products dataset.""" - if self._myntra_products is None: - self._myntra_products = MyntraProducts(self._engine) - return self._myntra_products - - @property - def ozon_products(self) -> OzonProducts: - """Ozon.ru Products dataset.""" - if self._ozon_products is None: - self._ozon_products = OzonProducts(self._engine) - return self._ozon_products - - @property - def wildberries_products(self) -> WildberriesProducts: - """Wildberries.ru Products dataset.""" - if self._wildberries_products is None: - self._wildberries_products = WildberriesProducts(self._engine) - return self._wildberries_products - - @property - def tokopedia_products(self) -> TokopediaProducts: - """Tokopedia Products dataset.""" - if self._tokopedia_products is None: - self._tokopedia_products = TokopediaProducts(self._engine) - return self._tokopedia_products - - @property - def google_shopping_products(self) -> 
GoogleShoppingProducts: - """Google Shopping Products dataset.""" - if self._google_shopping_products is None: - self._google_shopping_products = GoogleShoppingProducts(self._engine) - return self._google_shopping_products - - @property - def google_shopping_search_us(self) -> GoogleShoppingSearchUS: - """Google Shopping Search US dataset.""" - if self._google_shopping_search_us is None: - self._google_shopping_search_us = GoogleShoppingSearchUS(self._engine) - return self._google_shopping_search_us - - @property - def mercadolivre_products(self) -> MercadolivreProducts: - """MercadoLivre Products dataset.""" - if self._mercadolivre_products is None: - self._mercadolivre_products = MercadolivreProducts(self._engine) - return self._mercadolivre_products - - @property - def naver_products(self) -> NaverProducts: - """Naver Products dataset.""" - if self._naver_products is None: - self._naver_products = NaverProducts(self._engine) - return self._naver_products - - @property - def lazada_reviews(self) -> LazadaReviews: - """Lazada Reviews dataset.""" - if self._lazada_reviews is None: - self._lazada_reviews = LazadaReviews(self._engine) - return self._lazada_reviews - - @property - def lazada_products_search(self) -> LazadaProductsSearch: - """Lazada Products Search dataset.""" - if self._lazada_products_search is None: - self._lazada_products_search = LazadaProductsSearch(self._engine) - return self._lazada_products_search - - @property - def homedepot_us_products(self) -> HomeDepotUSProducts: - """Home Depot US Products dataset.""" - if self._homedepot_us_products is None: - self._homedepot_us_products = HomeDepotUSProducts(self._engine) - return self._homedepot_us_products - - @property - def homedepot_ca_products(self) -> HomeDepotCAProducts: - """Home Depot Canada Products dataset.""" - if self._homedepot_ca_products is None: - self._homedepot_ca_products = HomeDepotCAProducts(self._engine) - return self._homedepot_ca_products - - @property - def 
lowes_products(self) -> LowesProducts: - """Lowes Products dataset.""" - if self._lowes_products is None: - self._lowes_products = LowesProducts(self._engine) - return self._lowes_products - - @property - def rona_products(self) -> RonaProducts: - """Rona.ca Products dataset.""" - if self._rona_products is None: - self._rona_products = RonaProducts(self._engine) - return self._rona_products - - @property - def kroger_products(self) -> KrogerProducts: - """Kroger Products dataset.""" - if self._kroger_products is None: - self._kroger_products = KrogerProducts(self._engine) - return self._kroger_products - - @property - def macys_products(self) -> MacysProducts: - """Macys Products dataset.""" - if self._macys_products is None: - self._macys_products = MacysProducts(self._engine) - return self._macys_products - - @property - def costco_products(self) -> CostcoProducts: - """Costco Products dataset.""" - if self._costco_products is None: - self._costco_products = CostcoProducts(self._engine) - return self._costco_products - - @property - def bh_products(self) -> BHProducts: - """B&H Products dataset.""" - if self._bh_products is None: - self._bh_products = BHProducts(self._engine) - return self._bh_products - - @property - def microcenter_products(self) -> MicroCenterProducts: - """Micro Center Products dataset.""" - if self._microcenter_products is None: - self._microcenter_products = MicroCenterProducts(self._engine) - return self._microcenter_products - - @property - def autozone_products(self) -> AutozoneProducts: - """AutoZone Products dataset.""" - if self._autozone_products is None: - self._autozone_products = AutozoneProducts(self._engine) - return self._autozone_products - - # --- New dataset properties - Real Estate/Travel --- - - @property - def zillow_price_history(self) -> ZillowPriceHistory: - """Zillow Price History dataset.""" - if self._zillow_price_history is None: - self._zillow_price_history = ZillowPriceHistory(self._engine) - return 
self._zillow_price_history - - @property - def zoopla_properties(self) -> ZooplaProperties: - """Zoopla Properties dataset.""" - if self._zoopla_properties is None: - self._zoopla_properties = ZooplaProperties(self._engine) - return self._zoopla_properties - - @property - def booking_listings_search(self) -> BookingListingsSearch: - """Booking.com Listings Search dataset.""" - if self._booking_listings_search is None: - self._booking_listings_search = BookingListingsSearch(self._engine) - return self._booking_listings_search - - @property - def booking_hotel_listings(self) -> BookingHotelListings: - """Booking.com Hotel Listings dataset.""" - if self._booking_hotel_listings is None: - self._booking_hotel_listings = BookingHotelListings(self._engine) - return self._booking_hotel_listings - - @property - def realtor_international_properties(self) -> RealtorInternationalProperties: - """Realtor International Properties dataset.""" - if self._realtor_international_properties is None: - self._realtor_international_properties = RealtorInternationalProperties(self._engine) - return self._realtor_international_properties - - @property - def agoda_properties(self) -> AgodaProperties: - """Agoda Properties dataset.""" - if self._agoda_properties is None: - self._agoda_properties = AgodaProperties(self._engine) - return self._agoda_properties - - @property - def carsales_listings(self) -> CarsalesListings: - """Carsales Car Listings dataset.""" - if self._carsales_listings is None: - self._carsales_listings = CarsalesListings(self._engine) - return self._carsales_listings - - # --- New dataset properties - Finance/Maps --- - - @property - def yahoo_finance_businesses(self) -> YahooFinanceBusinesses: - """Yahoo Finance Businesses dataset.""" - if self._yahoo_finance_businesses is None: - self._yahoo_finance_businesses = YahooFinanceBusinesses(self._engine) - return self._yahoo_finance_businesses - - @property - def google_maps_full_info(self) -> GoogleMapsFullInfo: - """Google 
Maps Full Info dataset.""" - if self._google_maps_full_info is None: - self._google_maps_full_info = GoogleMapsFullInfo(self._engine) - return self._google_maps_full_info diff --git a/src/brightdata/datasets/client.pyi b/src/brightdata/datasets/client.pyi new file mode 100644 index 0000000..53d2f30 --- /dev/null +++ b/src/brightdata/datasets/client.pyi @@ -0,0 +1,1111 @@ +"""Type stub for DatasetsClient — provides IDE autocomplete.""" + +from typing import List +from .models import DatasetInfo +from ..core.engine import AsyncEngine + +from .agoda import AgodaProperties +from .airbnb import AirbnbProperties +from .amazon import ( + AmazonBestSellers, + AmazonProducts, + AmazonProductsGlobal, + AmazonProductsSearch, + AmazonReviews, + AmazonSellersInfo, + AmazonWalmart, +) +from .american_eagle import AmericanEagleProducts +from .apple_appstore import ( + AppleAppStore, + AppleAppStoreReviews, +) +from .ashley_furniture import AshleyFurnitureProducts +from .asos import AsosProducts +from .autozone import AutozoneProducts +from .balenciaga import BalenciagaProducts +from .bbc import BBCNews +from .berluti import BerlutiProducts +from .bestbuy import BestBuyProducts +from .bh import BHProducts +from .bluesky import ( + BlueskyPosts, + BlueskyTopProfiles, +) +from .booking import ( + BookingHotelListings, + BookingListingsSearch, +) +from .bottegaveneta import BottegaVenetaProducts +from .carsales import CarsalesListings +from .carters import CartersProducts +from .celine import CelineProducts +from .chanel import ChanelProducts +from .chileautos import ChileautosChile +from .cnn import CNNNews +from .companies_enriched import CompaniesEnriched +from .costco import CostcoProducts +from .crateandbarrel import CrateAndBarrelProducts +from .creative_commons import ( + CreativeCommons3DModels, + CreativeCommonsImages, +) +from .crunchbase import CrunchbaseCompanies +from .delvaux import DelvauxProducts +from .digikey import DigikeyProducts +from .dior import DiorProducts 
+from .ebay import EbayProducts +from .employees_enriched import EmployeesEnriched +from .etsy import EtsyProducts +from .facebook import ( + FacebookComments, + FacebookCompanyReviews, + FacebookEvents, + FacebookGroupPosts, + FacebookMarketplace, + FacebookPagesPosts, + FacebookPagesProfiles, + FacebookPostsByUrl, + FacebookProfiles, + FacebookReels, +) +from .fanatics import FanaticsProducts +from .fendi import FendiProducts +from .g2 import ( + G2Products, + G2Reviews, +) +from .github import GithubRepositories +from .glassdoor import ( + GlassdoorCompanies, + GlassdoorJobs, + GlassdoorReviews, +) +from .goodreads import GoodreadsBooks +from .google_maps import ( + GoogleMapsFullInfo, + GoogleMapsReviews, +) +from .google_news import GoogleNews +from .google_play import ( + GooglePlayReviews, + GooglePlayStore, +) +from .google_shopping import ( + GoogleShoppingProducts, + GoogleShoppingSearchUS, +) +from .hermes import HermesProducts +from .hm import HMProducts +from .homedepot import ( + HomeDepotCAProducts, + HomeDepotUSProducts, +) +from .ikea import IkeaProducts +from .imdb import IMDBMovies +from .indeed import ( + IndeedCompanies, + IndeedJobs, +) +from .infocasas import InfocasasUruguay +from .inmuebles24 import Inmuebles24Mexico +from .instagram import ( + InstagramComments, + InstagramPosts, + InstagramProfiles, + InstagramReels, +) +from .kroger import KrogerProducts +from .lawyers import USLawyers +from .lazada import ( + LazadaProducts, + LazadaProductsSearch, + LazadaReviews, +) +from .lazboy import LaZBoyProducts +from .lego import LegoProducts +from .linkedin import ( + LinkedInCompanyProfiles, + LinkedInJobListings, + LinkedInPeopleProfiles, + LinkedInPosts, + LinkedInProfilesJobListings, +) +from .llbean import LLBeanProducts +from .loewe import LoeweProducts +from .lowes import LowesProducts +from .macys import MacysProducts +from .mango import MangoProducts +from .manta import MantaBusinesses +from .massimo_dutti import MassimoDuttiProducts 
+from .mattressfirm import MattressfirmProducts +from .mediamarkt import MediamarktProducts +from .mercadolivre import MercadolivreProducts +from .metrocuadrado import MetrocuadradoProperties +from .microcenter import MicroCenterProducts +from .montblanc import MontblancProducts +from .mouser import MouserProducts +from .moynat import MoynatProducts +from .mybobs import MybobsProducts +from .myntra import MyntraProducts +from .naver import NaverProducts +from .nba import NBAPlayersStats +from .olx import OLXBrazil +from .otodom import OtodomPoland +from .owler import OwlerCompanies +from .ozon import OzonProducts +from .pinterest import ( + PinterestPosts, + PinterestProfiles, +) +from .pitchbook import PitchBookCompanies +from .prada import PradaProducts +from .properati import ProperatiProperties +from .quora import QuoraPosts +from .raymourflanigan import RaymourFlaniganProducts +from .real_estate import AustraliaRealEstate +from .realtor import RealtorInternationalProperties +from .reddit import ( + RedditComments, + RedditPosts, +) +from .rona import RonaProducts +from .sephora import SephoraProducts +from .shein import SheinProducts +from .shopee import ShopeeProducts +from .sleepnumber import SleepNumberProducts +from .slintel import SlintelCompanies +from .snapchat import SnapchatPosts +from .target import TargetProducts +from .tiktok import ( + TikTokComments, + TikTokPosts, + TikTokProfiles, + TikTokShop, +) +from .toctoc import ToctocProperties +from .tokopedia import TokopediaProducts +from .toysrus import ToysRUsProducts +from .trustpilot import TrustpilotReviews +from .trustradius import TrustRadiusReviews +from .ventureradar import VentureRadarCompanies +from .vimeo import VimeoVideos +from .walmart import ( + WalmartProducts, + WalmartSellersInfo, +) +from .wayfair import WayfairProducts +from .webmotors import WebmotorsBrasil +from .wikipedia import WikipediaArticles +from .wildberries import WildberriesProducts +from .world_population import 
WorldPopulation +from .world_zipcodes import WorldZipcodes +from .x_twitter import ( + XTwitterPosts, + XTwitterProfiles, +) +from .xing import XingProfiles +from .yahoo_finance import YahooFinanceBusinesses +from .yapo import YapoChile +from .yelp import ( + YelpBusinesses, + YelpReviews, +) +from .youtube import ( + YouTubeComments, + YouTubeProfiles, + YouTubeVideos, +) +from .ysl import YSLProducts +from .zalando import ZalandoProducts +from .zara import ( + ZaraHomeProducts, + ZaraProducts, +) +from .zillow import ( + ZillowPriceHistory, + ZillowProperties, +) +from .zonaprop import ZonapropArgentina +from .zoominfo import ZoomInfoCompanies +from .zoopla import ZooplaProperties + +class DatasetsClient: + BASE_URL: str + + def __init__(self, engine: AsyncEngine) -> None: ... + async def list(self) -> List[DatasetInfo]: ... + @property + def linkedin_profiles(self) -> LinkedInPeopleProfiles: + """LinkedIn People Profiles dataset (620M+ records).""" + ... + + @property + def linkedin_companies(self) -> LinkedInCompanyProfiles: + """LinkedIn Company Profiles dataset.""" + ... + + @property + def linkedin_job_listings(self) -> LinkedInJobListings: + """LinkedIn Profiles Jobs Listings dataset.""" + ... + + @property + def amazon_products(self) -> AmazonProducts: + """Amazon Products dataset.""" + ... + + @property + def amazon_reviews(self) -> AmazonReviews: + """Amazon Reviews dataset.""" + ... + + @property + def crunchbase_companies(self) -> CrunchbaseCompanies: + """Crunchbase Companies dataset (2.3M+ records).""" + ... + + @property + def imdb_movies(self) -> IMDBMovies: + """IMDB Movies dataset (867K+ records).""" + ... + + @property + def nba_players_stats(self) -> NBAPlayersStats: + """NBA Players Stats dataset (17K+ records).""" + ... + + @property + def goodreads_books(self) -> GoodreadsBooks: + """Goodreads Books dataset.""" + ... + + @property + def world_population(self) -> WorldPopulation: + """World Population dataset.""" + ... 
+ + @property + def companies_enriched(self) -> CompaniesEnriched: + """Companies Enriched dataset - multi-source company information.""" + ... + + @property + def employees_enriched(self) -> EmployeesEnriched: + """Employees Business Enriched dataset - LinkedIn profiles with company data.""" + ... + + @property + def glassdoor_companies(self) -> GlassdoorCompanies: + """Glassdoor Companies Overview dataset - ratings, reviews, and company details.""" + ... + + @property + def glassdoor_reviews(self) -> GlassdoorReviews: + """Glassdoor Companies Reviews dataset - employee reviews and ratings.""" + ... + + @property + def glassdoor_jobs(self) -> GlassdoorJobs: + """Glassdoor Job Listings dataset - job postings with company data.""" + ... + + @property + def google_maps_reviews(self) -> GoogleMapsReviews: + """Google Maps Reviews dataset - place reviews and ratings.""" + ... + + @property + def yelp_businesses(self) -> YelpBusinesses: + """Yelp Businesses Overview dataset - business listings and ratings.""" + ... + + @property + def yelp_reviews(self) -> YelpReviews: + """Yelp Business Reviews dataset - individual business reviews.""" + ... + + @property + def zoominfo_companies(self) -> ZoomInfoCompanies: + """ZoomInfo Companies dataset - company data with financials and contacts.""" + ... + + @property + def pitchbook_companies(self) -> PitchBookCompanies: + """PitchBook Companies dataset - PE/VC company data with deals.""" + ... + + @property + def g2_products(self) -> G2Products: + """G2 Software Product Overview dataset - software ratings and reviews.""" + ... + + @property + def g2_reviews(self) -> G2Reviews: + """G2 Software Product Reviews dataset - individual product reviews.""" + ... + + @property + def trustpilot_reviews(self) -> TrustpilotReviews: + """Trustpilot Business Reviews dataset - company reviews and ratings.""" + ... 
+ + @property + def indeed_companies(self) -> IndeedCompanies: + """Indeed Companies Info dataset - company profiles with jobs and reviews.""" + ... + + @property + def xing_profiles(self) -> XingProfiles: + """Xing Social Network Profiles dataset - professional profiles.""" + ... + + @property + def slintel_companies(self) -> SlintelCompanies: + """Slintel 6sense Company Information dataset - technographics and company data.""" + ... + + @property + def owler_companies(self) -> OwlerCompanies: + """Owler Companies Information dataset - competitive intelligence and metrics.""" + ... + + @property + def us_lawyers(self) -> USLawyers: + """US Lawyers Directory dataset - lawyer profiles and practice areas.""" + ... + + @property + def manta_businesses(self) -> MantaBusinesses: + """Manta Businesses dataset - business listings with revenue and employees.""" + ... + + @property + def ventureradar_companies(self) -> VentureRadarCompanies: + """VentureRadar Company Information dataset - startup intelligence.""" + ... + + @property + def trustradius_reviews(self) -> TrustRadiusReviews: + """TrustRadius Product Reviews dataset - software product reviews.""" + ... + + @property + def instagram_profiles(self) -> InstagramProfiles: + """Instagram Profiles dataset - user profiles and engagement.""" + ... + + @property + def tiktok_profiles(self) -> TikTokProfiles: + """TikTok Profiles dataset - user profiles and engagement.""" + ... + + @property + def australia_real_estate(self) -> AustraliaRealEstate: + """Australia Real Estate Properties dataset.""" + ... + + @property + def indeed_jobs(self) -> IndeedJobs: + """Indeed Job Listings dataset.""" + ... + + @property + def walmart_products(self) -> WalmartProducts: + """Walmart Products dataset.""" + ... + + @property + def mediamarkt_products(self) -> MediamarktProducts: + """Mediamarkt.de Products dataset.""" + ... + + @property + def fendi_products(self) -> FendiProducts: + """Fendi Products dataset.""" + ... 
+ + @property + def zalando_products(self) -> ZalandoProducts: + """Zalando Products dataset.""" + ... + + @property + def sephora_products(self) -> SephoraProducts: + """Sephora Products dataset.""" + ... + + @property + def zara_products(self) -> ZaraProducts: + """Zara Products dataset.""" + ... + + @property + def zara_home_products(self) -> ZaraHomeProducts: + """Zara Home Products dataset.""" + ... + + @property + def mango_products(self) -> MangoProducts: + """Mango Products dataset.""" + ... + + @property + def massimo_dutti_products(self) -> MassimoDuttiProducts: + """Massimo Dutti Products dataset.""" + ... + + @property + def otodom_poland(self) -> OtodomPoland: + """Otodom Poland real estate dataset.""" + ... + + @property + def webmotors_brasil(self) -> WebmotorsBrasil: + """Webmotors Brasil vehicle listings dataset.""" + ... + + @property + def airbnb_properties(self) -> AirbnbProperties: + """Airbnb Properties dataset.""" + ... + + @property + def asos_products(self) -> AsosProducts: + """Asos Products dataset.""" + ... + + @property + def chanel_products(self) -> ChanelProducts: + """Chanel Products dataset.""" + ... + + @property + def ashley_furniture_products(self) -> AshleyFurnitureProducts: + """Ashley Furniture Products dataset.""" + ... + + @property + def fanatics_products(self) -> FanaticsProducts: + """Fanatics Products dataset.""" + ... + + @property + def carters_products(self) -> CartersProducts: + """Carters Products dataset.""" + ... + + @property + def american_eagle_products(self) -> AmericanEagleProducts: + """American Eagle Products dataset.""" + ... + + @property + def ikea_products(self) -> IkeaProducts: + """Ikea Products dataset.""" + ... + + @property + def hm_products(self) -> HMProducts: + """H&M Products dataset.""" + ... + + @property + def lego_products(self) -> LegoProducts: + """Lego Products dataset.""" + ... 
+ + @property + def mattressfirm_products(self) -> MattressfirmProducts: + """Mattressfirm Products dataset.""" + ... + + @property + def crateandbarrel_products(self) -> CrateAndBarrelProducts: + """Crate and Barrel Products dataset.""" + ... + + @property + def llbean_products(self) -> LLBeanProducts: + """L.L. Bean Products dataset.""" + ... + + @property + def shein_products(self) -> SheinProducts: + """Shein Products dataset.""" + ... + + @property + def toysrus_products(self) -> ToysRUsProducts: + """Toys R Us Products dataset.""" + ... + + @property + def mybobs_products(self) -> MybobsProducts: + """Mybobs Products dataset.""" + ... + + @property + def sleepnumber_products(self) -> SleepNumberProducts: + """Sleep Number Products dataset.""" + ... + + @property + def raymourflanigan_products(self) -> RaymourFlaniganProducts: + """Raymour and Flanigan Products dataset.""" + ... + + @property + def inmuebles24_mexico(self) -> Inmuebles24Mexico: + """Inmuebles24 Mexico real estate dataset.""" + ... + + @property + def mouser_products(self) -> MouserProducts: + """Mouser Products dataset.""" + ... + + @property + def zillow_properties(self) -> ZillowProperties: + """Zillow Properties dataset.""" + ... + + @property + def zonaprop_argentina(self) -> ZonapropArgentina: + """Zonaprop Argentina real estate dataset.""" + ... + + @property + def metrocuadrado_properties(self) -> MetrocuadradoProperties: + """Metrocuadrado Properties dataset.""" + ... + + @property + def chileautos_chile(self) -> ChileautosChile: + """Chileautos Chile car listings dataset.""" + ... + + @property + def infocasas_uruguay(self) -> InfocasasUruguay: + """Infocasas Uruguay real estate dataset.""" + ... + + @property + def lazboy_products(self) -> LaZBoyProducts: + """La-Z-Boy Products dataset.""" + ... + + @property + def properati_properties(self) -> ProperatiProperties: + """Properati Properties dataset.""" + ... 
+ + @property + def yapo_chile(self) -> YapoChile: + """Yapo Chile marketplace ads dataset.""" + ... + + @property + def toctoc_properties(self) -> ToctocProperties: + """Toctoc Properties dataset.""" + ... + + @property + def dior_products(self) -> DiorProducts: + """Dior Products dataset.""" + ... + + @property + def balenciaga_products(self) -> BalenciagaProducts: + """Balenciaga Products dataset.""" + ... + + @property + def bottegaveneta_products(self) -> BottegaVenetaProducts: + """Bottega Veneta Products dataset.""" + ... + + @property + def olx_brazil(self) -> OLXBrazil: + """OLX Brazil marketplace ads dataset.""" + ... + + @property + def celine_products(self) -> CelineProducts: + """Celine Products dataset.""" + ... + + @property + def loewe_products(self) -> LoeweProducts: + """Loewe Products dataset.""" + ... + + @property + def berluti_products(self) -> BerlutiProducts: + """Berluti Products dataset.""" + ... + + @property + def moynat_products(self) -> MoynatProducts: + """Moynat Products dataset.""" + ... + + @property + def hermes_products(self) -> HermesProducts: + """Hermes Products dataset.""" + ... + + @property + def delvaux_products(self) -> DelvauxProducts: + """Delvaux Products dataset.""" + ... + + @property + def prada_products(self) -> PradaProducts: + """Prada Products dataset.""" + ... + + @property + def montblanc_products(self) -> MontblancProducts: + """Montblanc Products dataset.""" + ... + + @property + def ysl_products(self) -> YSLProducts: + """YSL Products dataset.""" + ... + + @property + def amazon_sellers_info(self) -> AmazonSellersInfo: + """Amazon Sellers Info dataset.""" + ... + + @property + def world_zipcodes(self) -> WorldZipcodes: + """World Zipcodes dataset.""" + ... + + @property + def pinterest_posts(self) -> PinterestPosts: + """Pinterest Posts dataset.""" + ... + + @property + def pinterest_profiles(self) -> PinterestProfiles: + """Pinterest Profiles dataset.""" + ... 
+ + @property + def shopee_products(self) -> ShopeeProducts: + """Shopee Products dataset.""" + ... + + @property + def lazada_products(self) -> LazadaProducts: + """Lazada Products dataset.""" + ... + + @property + def instagram_posts(self) -> InstagramPosts: + """Instagram Posts dataset.""" + ... + + @property + def youtube_profiles(self) -> YouTubeProfiles: + """YouTube Profiles dataset.""" + ... + + @property + def youtube_videos(self) -> YouTubeVideos: + """YouTube Videos dataset.""" + ... + + @property + def youtube_comments(self) -> YouTubeComments: + """YouTube Comments dataset.""" + ... + + @property + def digikey_products(self) -> DigikeyProducts: + """Digikey Products dataset.""" + ... + + @property + def facebook_pages_posts(self) -> FacebookPagesPosts: + """Facebook Pages Posts dataset.""" + ... + + @property + def facebook_comments(self) -> FacebookComments: + """Facebook Comments dataset.""" + ... + + @property + def facebook_posts_by_url(self) -> FacebookPostsByUrl: + """Facebook Posts by URL dataset.""" + ... + + @property + def facebook_reels(self) -> FacebookReels: + """Facebook Reels dataset.""" + ... + + @property + def facebook_marketplace(self) -> FacebookMarketplace: + """Facebook Marketplace dataset.""" + ... + + @property + def facebook_company_reviews(self) -> FacebookCompanyReviews: + """Facebook Company Reviews dataset.""" + ... + + @property + def facebook_events(self) -> FacebookEvents: + """Facebook Events dataset.""" + ... + + @property + def facebook_profiles(self) -> FacebookProfiles: + """Facebook Profiles dataset.""" + ... + + @property + def facebook_pages_profiles(self) -> FacebookPagesProfiles: + """Facebook Pages and Profiles dataset.""" + ... + + @property + def facebook_group_posts(self) -> FacebookGroupPosts: + """Facebook Group Posts dataset.""" + ... + + @property + def tiktok_comments(self) -> TikTokComments: + """TikTok Comments dataset.""" + ... 
+ + @property + def tiktok_posts(self) -> TikTokPosts: + """TikTok Posts dataset.""" + ... + + @property + def tiktok_shop(self) -> TikTokShop: + """TikTok Shop dataset.""" + ... + + @property + def instagram_comments(self) -> InstagramComments: + """Instagram Comments dataset.""" + ... + + @property + def instagram_reels(self) -> InstagramReels: + """Instagram Reels dataset.""" + ... + + @property + def linkedin_posts(self) -> LinkedInPosts: + """LinkedIn Posts dataset.""" + ... + + @property + def linkedin_profiles_job_listings(self) -> LinkedInProfilesJobListings: + """LinkedIn Profiles Job Listings dataset.""" + ... + + @property + def x_twitter_posts(self) -> XTwitterPosts: + """X (Twitter) Posts dataset.""" + ... + + @property + def x_twitter_profiles(self) -> XTwitterProfiles: + """X (Twitter) Profiles dataset.""" + ... + + @property + def reddit_posts(self) -> RedditPosts: + """Reddit Posts dataset.""" + ... + + @property + def reddit_comments(self) -> RedditComments: + """Reddit Comments dataset.""" + ... + + @property + def bluesky_posts(self) -> BlueskyPosts: + """Bluesky Posts dataset.""" + ... + + @property + def bluesky_top_profiles(self) -> BlueskyTopProfiles: + """Top 500 Bluesky Profiles dataset.""" + ... + + @property + def snapchat_posts(self) -> SnapchatPosts: + """Snapchat Posts dataset.""" + ... + + @property + def quora_posts(self) -> QuoraPosts: + """Quora Posts dataset.""" + ... + + @property + def vimeo_videos(self) -> VimeoVideos: + """Vimeo Videos dataset.""" + ... + + @property + def google_news(self) -> GoogleNews: + """Google News dataset.""" + ... + + @property + def wikipedia_articles(self) -> WikipediaArticles: + """Wikipedia Articles dataset.""" + ... + + @property + def bbc_news(self) -> BBCNews: + """BBC News dataset.""" + ... + + @property + def cnn_news(self) -> CNNNews: + """CNN News dataset.""" + ... + + @property + def github_repositories(self) -> GithubRepositories: + """GitHub Repositories dataset.""" + ... 
+ + @property + def creative_commons_images(self) -> CreativeCommonsImages: + """Creative Commons Images dataset.""" + ... + + @property + def creative_commons_3d_models(self) -> CreativeCommons3DModels: + """Creative Commons 3D Models dataset.""" + ... + + @property + def google_play_store(self) -> GooglePlayStore: + """Google Play Store dataset.""" + ... + + @property + def google_play_reviews(self) -> GooglePlayReviews: + """Google Play Store Reviews dataset.""" + ... + + @property + def apple_app_store(self) -> AppleAppStore: + """Apple App Store dataset.""" + ... + + @property + def apple_app_store_reviews(self) -> AppleAppStoreReviews: + """Apple App Store Reviews dataset.""" + ... + + @property + def amazon_best_sellers(self) -> AmazonBestSellers: + """Amazon Best Sellers dataset.""" + ... + + @property + def amazon_products_search(self) -> AmazonProductsSearch: + """Amazon Products Search dataset.""" + ... + + @property + def amazon_products_global(self) -> AmazonProductsGlobal: + """Amazon Products Global dataset.""" + ... + + @property + def amazon_walmart(self) -> AmazonWalmart: + """Amazon Walmart dataset.""" + ... + + @property + def walmart_sellers_info(self) -> WalmartSellersInfo: + """Walmart Sellers Info dataset.""" + ... + + @property + def ebay_products(self) -> EbayProducts: + """eBay Products dataset.""" + ... + + @property + def etsy_products(self) -> EtsyProducts: + """Etsy Products dataset.""" + ... + + @property + def target_products(self) -> TargetProducts: + """Target Products dataset.""" + ... + + @property + def wayfair_products(self) -> WayfairProducts: + """Wayfair Products dataset.""" + ... + + @property + def bestbuy_products(self) -> BestBuyProducts: + """Best Buy Products dataset.""" + ... + + @property + def myntra_products(self) -> MyntraProducts: + """Myntra Products dataset.""" + ... + + @property + def ozon_products(self) -> OzonProducts: + """Ozon.ru Products dataset.""" + ... 
+ + @property + def wildberries_products(self) -> WildberriesProducts: + """Wildberries.ru Products dataset.""" + ... + + @property + def tokopedia_products(self) -> TokopediaProducts: + """Tokopedia Products dataset.""" + ... + + @property + def google_shopping_products(self) -> GoogleShoppingProducts: + """Google Shopping Products dataset.""" + ... + + @property + def google_shopping_search_us(self) -> GoogleShoppingSearchUS: + """Google Shopping Search US dataset.""" + ... + + @property + def mercadolivre_products(self) -> MercadolivreProducts: + """MercadoLivre Products dataset.""" + ... + + @property + def naver_products(self) -> NaverProducts: + """Naver Products dataset.""" + ... + + @property + def lazada_reviews(self) -> LazadaReviews: + """Lazada Reviews dataset.""" + ... + + @property + def lazada_products_search(self) -> LazadaProductsSearch: + """Lazada Products Search dataset.""" + ... + + @property + def homedepot_us_products(self) -> HomeDepotUSProducts: + """Home Depot US Products dataset.""" + ... + + @property + def homedepot_ca_products(self) -> HomeDepotCAProducts: + """Home Depot Canada Products dataset.""" + ... + + @property + def lowes_products(self) -> LowesProducts: + """Lowes Products dataset.""" + ... + + @property + def rona_products(self) -> RonaProducts: + """Rona.ca Products dataset.""" + ... + + @property + def kroger_products(self) -> KrogerProducts: + """Kroger Products dataset.""" + ... + + @property + def macys_products(self) -> MacysProducts: + """Macys Products dataset.""" + ... + + @property + def costco_products(self) -> CostcoProducts: + """Costco Products dataset.""" + ... + + @property + def bh_products(self) -> BHProducts: + """B&H Products dataset.""" + ... + + @property + def microcenter_products(self) -> MicroCenterProducts: + """Micro Center Products dataset.""" + ... + + @property + def autozone_products(self) -> AutozoneProducts: + """AutoZone Products dataset.""" + ... 
+ + @property + def zillow_price_history(self) -> ZillowPriceHistory: + """Zillow Price History dataset.""" + ... + + @property + def zoopla_properties(self) -> ZooplaProperties: + """Zoopla Properties dataset.""" + ... + + @property + def booking_listings_search(self) -> BookingListingsSearch: + """Booking.com Listings Search dataset.""" + ... + + @property + def booking_hotel_listings(self) -> BookingHotelListings: + """Booking.com Hotel Listings dataset.""" + ... + + @property + def realtor_international_properties(self) -> RealtorInternationalProperties: + """Realtor International Properties dataset.""" + ... + + @property + def agoda_properties(self) -> AgodaProperties: + """Agoda Properties dataset.""" + ... + + @property + def carsales_listings(self) -> CarsalesListings: + """Carsales Car Listings dataset.""" + ... + + @property + def yahoo_finance_businesses(self) -> YahooFinanceBusinesses: + """Yahoo Finance Businesses dataset.""" + ... + + @property + def google_maps_full_info(self) -> GoogleMapsFullInfo: + """Google Maps Full Info dataset.""" + ... 
diff --git a/src/brightdata/datasets/cnn/news.py b/src/brightdata/datasets/cnn/news.py index ad3e4f4..880388e 100644 --- a/src/brightdata/datasets/cnn/news.py +++ b/src/brightdata/datasets/cnn/news.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class CNNNews(BaseDataset): diff --git a/src/brightdata/datasets/companies_enriched/companies.py b/src/brightdata/datasets/companies_enriched/companies.py index c6a7bca..4649da4 100644 --- a/src/brightdata/datasets/companies_enriched/companies.py +++ b/src/brightdata/datasets/companies_enriched/companies.py @@ -18,7 +18,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Data source suffixes diff --git a/src/brightdata/datasets/costco/products.py b/src/brightdata/datasets/costco/products.py index 6b0ed32..a0e1163 100644 --- a/src/brightdata/datasets/costco/products.py +++ b/src/brightdata/datasets/costco/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class CostcoProducts(BaseDataset): diff --git a/src/brightdata/datasets/crateandbarrel/products.py b/src/brightdata/datasets/crateandbarrel/products.py index 4f5ce3a..824b553 100644 --- a/src/brightdata/datasets/crateandbarrel/products.py +++ b/src/brightdata/datasets/crateandbarrel/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class CrateAndBarrelProducts(BaseDataset): diff --git a/src/brightdata/datasets/creative_commons/images.py b/src/brightdata/datasets/creative_commons/images.py index f416875..ab56ff7 100644 --- a/src/brightdata/datasets/creative_commons/images.py +++ b/src/brightdata/datasets/creative_commons/images.py @@ -11,7 +11,7 @@ 
from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class CreativeCommonsImages(BaseDataset): diff --git a/src/brightdata/datasets/creative_commons/models_3d.py b/src/brightdata/datasets/creative_commons/models_3d.py index 0aa9448..6bcd676 100644 --- a/src/brightdata/datasets/creative_commons/models_3d.py +++ b/src/brightdata/datasets/creative_commons/models_3d.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class CreativeCommons3DModels(BaseDataset): diff --git a/src/brightdata/datasets/crunchbase/companies.py b/src/brightdata/datasets/crunchbase/companies.py index 18e8051..616d158 100644 --- a/src/brightdata/datasets/crunchbase/companies.py +++ b/src/brightdata/datasets/crunchbase/companies.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class CrunchbaseCompanies(BaseDataset): diff --git a/src/brightdata/datasets/delvaux/products.py b/src/brightdata/datasets/delvaux/products.py index 8a56114..d43daee 100644 --- a/src/brightdata/datasets/delvaux/products.py +++ b/src/brightdata/datasets/delvaux/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class DelvauxProducts(BaseDataset): diff --git a/src/brightdata/datasets/digikey/products.py b/src/brightdata/datasets/digikey/products.py index 499dfc3..fc30ba0 100644 --- a/src/brightdata/datasets/digikey/products.py +++ b/src/brightdata/datasets/digikey/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class DigikeyProducts(BaseDataset): diff --git a/src/brightdata/datasets/dior/products.py 
b/src/brightdata/datasets/dior/products.py index ab05600..b638f2d 100644 --- a/src/brightdata/datasets/dior/products.py +++ b/src/brightdata/datasets/dior/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class DiorProducts(BaseDataset): diff --git a/src/brightdata/datasets/ebay/products.py b/src/brightdata/datasets/ebay/products.py index f1b189e..cef3397 100644 --- a/src/brightdata/datasets/ebay/products.py +++ b/src/brightdata/datasets/ebay/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class EbayProducts(BaseDataset): diff --git a/src/brightdata/datasets/employees_enriched/employees.py b/src/brightdata/datasets/employees_enriched/employees.py index 9ba0da1..da533ef 100644 --- a/src/brightdata/datasets/employees_enriched/employees.py +++ b/src/brightdata/datasets/employees_enriched/employees.py @@ -13,7 +13,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories for organization diff --git a/src/brightdata/datasets/etsy/products.py b/src/brightdata/datasets/etsy/products.py index 1243eef..d085431 100644 --- a/src/brightdata/datasets/etsy/products.py +++ b/src/brightdata/datasets/etsy/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class EtsyProducts(BaseDataset): diff --git a/src/brightdata/datasets/facebook/comments.py b/src/brightdata/datasets/facebook/comments.py index 07ca75a..f7ec84a 100644 --- a/src/brightdata/datasets/facebook/comments.py +++ b/src/brightdata/datasets/facebook/comments.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + 
from ...core.engine import AsyncEngine class FacebookComments(BaseDataset): diff --git a/src/brightdata/datasets/facebook/company_reviews.py b/src/brightdata/datasets/facebook/company_reviews.py index 6a475dc..b9f0fee 100644 --- a/src/brightdata/datasets/facebook/company_reviews.py +++ b/src/brightdata/datasets/facebook/company_reviews.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FacebookCompanyReviews(BaseDataset): diff --git a/src/brightdata/datasets/facebook/events.py b/src/brightdata/datasets/facebook/events.py index 21db6c4..9b4ad79 100644 --- a/src/brightdata/datasets/facebook/events.py +++ b/src/brightdata/datasets/facebook/events.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FacebookEvents(BaseDataset): diff --git a/src/brightdata/datasets/facebook/group_posts.py b/src/brightdata/datasets/facebook/group_posts.py index abfadc6..6f93e61 100644 --- a/src/brightdata/datasets/facebook/group_posts.py +++ b/src/brightdata/datasets/facebook/group_posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FacebookGroupPosts(BaseDataset): diff --git a/src/brightdata/datasets/facebook/marketplace.py b/src/brightdata/datasets/facebook/marketplace.py index e240930..f44faad 100644 --- a/src/brightdata/datasets/facebook/marketplace.py +++ b/src/brightdata/datasets/facebook/marketplace.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FacebookMarketplace(BaseDataset): diff --git a/src/brightdata/datasets/facebook/pages_posts.py b/src/brightdata/datasets/facebook/pages_posts.py index bccc3b6..f00a353 100644 --- 
a/src/brightdata/datasets/facebook/pages_posts.py +++ b/src/brightdata/datasets/facebook/pages_posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FacebookPagesPosts(BaseDataset): diff --git a/src/brightdata/datasets/facebook/pages_profiles.py b/src/brightdata/datasets/facebook/pages_profiles.py index 9094631..42637c4 100644 --- a/src/brightdata/datasets/facebook/pages_profiles.py +++ b/src/brightdata/datasets/facebook/pages_profiles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FacebookPagesProfiles(BaseDataset): diff --git a/src/brightdata/datasets/facebook/posts_by_url.py b/src/brightdata/datasets/facebook/posts_by_url.py index 76e33b4..52fdd19 100644 --- a/src/brightdata/datasets/facebook/posts_by_url.py +++ b/src/brightdata/datasets/facebook/posts_by_url.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FacebookPostsByUrl(BaseDataset): diff --git a/src/brightdata/datasets/facebook/profiles.py b/src/brightdata/datasets/facebook/profiles.py index 8a18e0c..ef3e5a4 100644 --- a/src/brightdata/datasets/facebook/profiles.py +++ b/src/brightdata/datasets/facebook/profiles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FacebookProfiles(BaseDataset): diff --git a/src/brightdata/datasets/facebook/reels.py b/src/brightdata/datasets/facebook/reels.py index 78ad7f4..cae1240 100644 --- a/src/brightdata/datasets/facebook/reels.py +++ b/src/brightdata/datasets/facebook/reels.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import 
AsyncEngine class FacebookReels(BaseDataset): diff --git a/src/brightdata/datasets/fanatics/products.py b/src/brightdata/datasets/fanatics/products.py index 28faff2..fa11422 100644 --- a/src/brightdata/datasets/fanatics/products.py +++ b/src/brightdata/datasets/fanatics/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FanaticsProducts(BaseDataset): diff --git a/src/brightdata/datasets/fendi/products.py b/src/brightdata/datasets/fendi/products.py index d8238cd..853782d 100644 --- a/src/brightdata/datasets/fendi/products.py +++ b/src/brightdata/datasets/fendi/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class FendiProducts(BaseDataset): diff --git a/src/brightdata/datasets/g2/products.py b/src/brightdata/datasets/g2/products.py index e3ca936..ae0df2b 100644 --- a/src/brightdata/datasets/g2/products.py +++ b/src/brightdata/datasets/g2/products.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/g2/reviews.py b/src/brightdata/datasets/g2/reviews.py index de8456a..def0053 100644 --- a/src/brightdata/datasets/g2/reviews.py +++ b/src/brightdata/datasets/g2/reviews.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/github/repositories.py b/src/brightdata/datasets/github/repositories.py index b72ec19..87c0fb7 100644 --- a/src/brightdata/datasets/github/repositories.py +++ b/src/brightdata/datasets/github/repositories.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine 
import AsyncEngine + from ...core.engine import AsyncEngine class GithubRepositories(BaseDataset): diff --git a/src/brightdata/datasets/glassdoor/companies.py b/src/brightdata/datasets/glassdoor/companies.py index 25435cf..b0eece2 100644 --- a/src/brightdata/datasets/glassdoor/companies.py +++ b/src/brightdata/datasets/glassdoor/companies.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories for organization diff --git a/src/brightdata/datasets/glassdoor/jobs.py b/src/brightdata/datasets/glassdoor/jobs.py index 4270103..cb77d5e 100644 --- a/src/brightdata/datasets/glassdoor/jobs.py +++ b/src/brightdata/datasets/glassdoor/jobs.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/glassdoor/reviews.py b/src/brightdata/datasets/glassdoor/reviews.py index 1d4d613..3990797 100644 --- a/src/brightdata/datasets/glassdoor/reviews.py +++ b/src/brightdata/datasets/glassdoor/reviews.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/goodreads/books.py b/src/brightdata/datasets/goodreads/books.py index 3c43689..abcc4e1 100644 --- a/src/brightdata/datasets/goodreads/books.py +++ b/src/brightdata/datasets/goodreads/books.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class GoodreadsBooks(BaseDataset): diff --git a/src/brightdata/datasets/google_maps/full_info.py b/src/brightdata/datasets/google_maps/full_info.py index 50f9b38..773558e 100644 --- a/src/brightdata/datasets/google_maps/full_info.py +++ 
b/src/brightdata/datasets/google_maps/full_info.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class GoogleMapsFullInfo(BaseDataset): diff --git a/src/brightdata/datasets/google_maps/reviews.py b/src/brightdata/datasets/google_maps/reviews.py index db6a0e9..ec8cb8b 100644 --- a/src/brightdata/datasets/google_maps/reviews.py +++ b/src/brightdata/datasets/google_maps/reviews.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/google_news/news.py b/src/brightdata/datasets/google_news/news.py index 984628e..1caffba 100644 --- a/src/brightdata/datasets/google_news/news.py +++ b/src/brightdata/datasets/google_news/news.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class GoogleNews(BaseDataset): diff --git a/src/brightdata/datasets/google_play/reviews.py b/src/brightdata/datasets/google_play/reviews.py index 2ee94c6..cdcd0c1 100644 --- a/src/brightdata/datasets/google_play/reviews.py +++ b/src/brightdata/datasets/google_play/reviews.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class GooglePlayReviews(BaseDataset): diff --git a/src/brightdata/datasets/google_play/store.py b/src/brightdata/datasets/google_play/store.py index 5604d80..a4b7fc3 100644 --- a/src/brightdata/datasets/google_play/store.py +++ b/src/brightdata/datasets/google_play/store.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class GooglePlayStore(BaseDataset): diff --git 
a/src/brightdata/datasets/google_shopping/products.py b/src/brightdata/datasets/google_shopping/products.py index ed6525a..82a1440 100644 --- a/src/brightdata/datasets/google_shopping/products.py +++ b/src/brightdata/datasets/google_shopping/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class GoogleShoppingProducts(BaseDataset): diff --git a/src/brightdata/datasets/google_shopping/search_us.py b/src/brightdata/datasets/google_shopping/search_us.py index 142b349..ad01086 100644 --- a/src/brightdata/datasets/google_shopping/search_us.py +++ b/src/brightdata/datasets/google_shopping/search_us.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class GoogleShoppingSearchUS(BaseDataset): diff --git a/src/brightdata/datasets/hermes/products.py b/src/brightdata/datasets/hermes/products.py index 9c1444a..ef2a134 100644 --- a/src/brightdata/datasets/hermes/products.py +++ b/src/brightdata/datasets/hermes/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class HermesProducts(BaseDataset): diff --git a/src/brightdata/datasets/hm/products.py b/src/brightdata/datasets/hm/products.py index 72287da..d528d82 100644 --- a/src/brightdata/datasets/hm/products.py +++ b/src/brightdata/datasets/hm/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class HMProducts(BaseDataset): diff --git a/src/brightdata/datasets/homedepot/products_ca.py b/src/brightdata/datasets/homedepot/products_ca.py index 35d5c4b..4d6b96a 100644 --- a/src/brightdata/datasets/homedepot/products_ca.py +++ b/src/brightdata/datasets/homedepot/products_ca.py @@ -11,7 
+11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class HomeDepotCAProducts(BaseDataset): diff --git a/src/brightdata/datasets/homedepot/products_us.py b/src/brightdata/datasets/homedepot/products_us.py index a2afee1..6a19cb2 100644 --- a/src/brightdata/datasets/homedepot/products_us.py +++ b/src/brightdata/datasets/homedepot/products_us.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class HomeDepotUSProducts(BaseDataset): diff --git a/src/brightdata/datasets/ikea/products.py b/src/brightdata/datasets/ikea/products.py index 89114e6..be19a90 100644 --- a/src/brightdata/datasets/ikea/products.py +++ b/src/brightdata/datasets/ikea/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class IkeaProducts(BaseDataset): diff --git a/src/brightdata/datasets/imdb/movies.py b/src/brightdata/datasets/imdb/movies.py index a7face3..21aee7a 100644 --- a/src/brightdata/datasets/imdb/movies.py +++ b/src/brightdata/datasets/imdb/movies.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class IMDBMovies(BaseDataset): diff --git a/src/brightdata/datasets/indeed/companies.py b/src/brightdata/datasets/indeed/companies.py index 5f9cf29..13b9a3f 100644 --- a/src/brightdata/datasets/indeed/companies.py +++ b/src/brightdata/datasets/indeed/companies.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/indeed/jobs.py b/src/brightdata/datasets/indeed/jobs.py index d27adc9..702bb41 100644 --- 
a/src/brightdata/datasets/indeed/jobs.py +++ b/src/brightdata/datasets/indeed/jobs.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class IndeedJobs(BaseDataset): diff --git a/src/brightdata/datasets/infocasas/properties.py b/src/brightdata/datasets/infocasas/properties.py index 033f995..8118959 100644 --- a/src/brightdata/datasets/infocasas/properties.py +++ b/src/brightdata/datasets/infocasas/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class InfocasasUruguay(BaseDataset): diff --git a/src/brightdata/datasets/inmuebles24/properties.py b/src/brightdata/datasets/inmuebles24/properties.py index 6abdbae..055b23a 100644 --- a/src/brightdata/datasets/inmuebles24/properties.py +++ b/src/brightdata/datasets/inmuebles24/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class Inmuebles24Mexico(BaseDataset): diff --git a/src/brightdata/datasets/instagram/comments.py b/src/brightdata/datasets/instagram/comments.py index ea90da7..37517aa 100644 --- a/src/brightdata/datasets/instagram/comments.py +++ b/src/brightdata/datasets/instagram/comments.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class InstagramComments(BaseDataset): diff --git a/src/brightdata/datasets/instagram/posts.py b/src/brightdata/datasets/instagram/posts.py index f32d605..290afef 100644 --- a/src/brightdata/datasets/instagram/posts.py +++ b/src/brightdata/datasets/instagram/posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class 
InstagramPosts(BaseDataset): diff --git a/src/brightdata/datasets/instagram/profiles.py b/src/brightdata/datasets/instagram/profiles.py index 800e79d..c3ea260 100644 --- a/src/brightdata/datasets/instagram/profiles.py +++ b/src/brightdata/datasets/instagram/profiles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class InstagramProfiles(BaseDataset): diff --git a/src/brightdata/datasets/instagram/reels.py b/src/brightdata/datasets/instagram/reels.py index 78135a6..f064ccc 100644 --- a/src/brightdata/datasets/instagram/reels.py +++ b/src/brightdata/datasets/instagram/reels.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class InstagramReels(BaseDataset): diff --git a/src/brightdata/datasets/kroger/products.py b/src/brightdata/datasets/kroger/products.py index 1a7cca0..986e712 100644 --- a/src/brightdata/datasets/kroger/products.py +++ b/src/brightdata/datasets/kroger/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class KrogerProducts(BaseDataset): diff --git a/src/brightdata/datasets/lawyers/us_lawyers.py b/src/brightdata/datasets/lawyers/us_lawyers.py index db9d8fc..ccde4bf 100644 --- a/src/brightdata/datasets/lawyers/us_lawyers.py +++ b/src/brightdata/datasets/lawyers/us_lawyers.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/lazada/products.py b/src/brightdata/datasets/lazada/products.py index b883aa5..fa17377 100644 --- a/src/brightdata/datasets/lazada/products.py +++ b/src/brightdata/datasets/lazada/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if 
TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LazadaProducts(BaseDataset): diff --git a/src/brightdata/datasets/lazada/products_search.py b/src/brightdata/datasets/lazada/products_search.py index 3d36d04..69d23b8 100644 --- a/src/brightdata/datasets/lazada/products_search.py +++ b/src/brightdata/datasets/lazada/products_search.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LazadaProductsSearch(BaseDataset): diff --git a/src/brightdata/datasets/lazada/reviews.py b/src/brightdata/datasets/lazada/reviews.py index dedcdf2..14af421 100644 --- a/src/brightdata/datasets/lazada/reviews.py +++ b/src/brightdata/datasets/lazada/reviews.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LazadaReviews(BaseDataset): diff --git a/src/brightdata/datasets/lazboy/products.py b/src/brightdata/datasets/lazboy/products.py index 67026e3..784921a 100644 --- a/src/brightdata/datasets/lazboy/products.py +++ b/src/brightdata/datasets/lazboy/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LaZBoyProducts(BaseDataset): diff --git a/src/brightdata/datasets/lego/products.py b/src/brightdata/datasets/lego/products.py index 626f69f..30a5ffc 100644 --- a/src/brightdata/datasets/lego/products.py +++ b/src/brightdata/datasets/lego/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LegoProducts(BaseDataset): diff --git a/src/brightdata/datasets/linkedin/company_profiles.py b/src/brightdata/datasets/linkedin/company_profiles.py index a4c4bf2..daefbd7 100644 --- 
a/src/brightdata/datasets/linkedin/company_profiles.py +++ b/src/brightdata/datasets/linkedin/company_profiles.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LinkedInCompanyProfiles(BaseDataset): diff --git a/src/brightdata/datasets/linkedin/job_listings.py b/src/brightdata/datasets/linkedin/job_listings.py index 7b01f40..fcde0d0 100644 --- a/src/brightdata/datasets/linkedin/job_listings.py +++ b/src/brightdata/datasets/linkedin/job_listings.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/linkedin/people_profiles.py b/src/brightdata/datasets/linkedin/people_profiles.py index 6c90bca..34bc07f 100644 --- a/src/brightdata/datasets/linkedin/people_profiles.py +++ b/src/brightdata/datasets/linkedin/people_profiles.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LinkedInPeopleProfiles(BaseDataset): diff --git a/src/brightdata/datasets/linkedin/posts.py b/src/brightdata/datasets/linkedin/posts.py index 38980b2..32acf94 100644 --- a/src/brightdata/datasets/linkedin/posts.py +++ b/src/brightdata/datasets/linkedin/posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LinkedInPosts(BaseDataset): diff --git a/src/brightdata/datasets/linkedin/profiles_job_listings.py b/src/brightdata/datasets/linkedin/profiles_job_listings.py index 97e0e1e..ff6356d 100644 --- a/src/brightdata/datasets/linkedin/profiles_job_listings.py +++ b/src/brightdata/datasets/linkedin/profiles_job_listings.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from 
...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LinkedInProfilesJobListings(BaseDataset): diff --git a/src/brightdata/datasets/llbean/products.py b/src/brightdata/datasets/llbean/products.py index 106e507..e123a6c 100644 --- a/src/brightdata/datasets/llbean/products.py +++ b/src/brightdata/datasets/llbean/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LLBeanProducts(BaseDataset): diff --git a/src/brightdata/datasets/loewe/products.py b/src/brightdata/datasets/loewe/products.py index d881a19..39f3ecc 100644 --- a/src/brightdata/datasets/loewe/products.py +++ b/src/brightdata/datasets/loewe/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LoeweProducts(BaseDataset): diff --git a/src/brightdata/datasets/lowes/products.py b/src/brightdata/datasets/lowes/products.py index fd41c76..108b278 100644 --- a/src/brightdata/datasets/lowes/products.py +++ b/src/brightdata/datasets/lowes/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class LowesProducts(BaseDataset): diff --git a/src/brightdata/datasets/macys/products.py b/src/brightdata/datasets/macys/products.py index 8c862a2..de3ed15 100644 --- a/src/brightdata/datasets/macys/products.py +++ b/src/brightdata/datasets/macys/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MacysProducts(BaseDataset): diff --git a/src/brightdata/datasets/mango/products.py b/src/brightdata/datasets/mango/products.py index 274e96e..43138b4 100644 --- a/src/brightdata/datasets/mango/products.py +++ 
b/src/brightdata/datasets/mango/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MangoProducts(BaseDataset): diff --git a/src/brightdata/datasets/manta/businesses.py b/src/brightdata/datasets/manta/businesses.py index 4c1d171..861df55 100644 --- a/src/brightdata/datasets/manta/businesses.py +++ b/src/brightdata/datasets/manta/businesses.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/massimo_dutti/products.py b/src/brightdata/datasets/massimo_dutti/products.py index 5a8d70e..4839123 100644 --- a/src/brightdata/datasets/massimo_dutti/products.py +++ b/src/brightdata/datasets/massimo_dutti/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MassimoDuttiProducts(BaseDataset): diff --git a/src/brightdata/datasets/mattressfirm/products.py b/src/brightdata/datasets/mattressfirm/products.py index f27f434..7429663 100644 --- a/src/brightdata/datasets/mattressfirm/products.py +++ b/src/brightdata/datasets/mattressfirm/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MattressfirmProducts(BaseDataset): diff --git a/src/brightdata/datasets/mediamarkt/products.py b/src/brightdata/datasets/mediamarkt/products.py index 5c3a151..539b19e 100644 --- a/src/brightdata/datasets/mediamarkt/products.py +++ b/src/brightdata/datasets/mediamarkt/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MediamarktProducts(BaseDataset): diff --git 
a/src/brightdata/datasets/mercadolivre/products.py b/src/brightdata/datasets/mercadolivre/products.py index 948a7c8..67188cc 100644 --- a/src/brightdata/datasets/mercadolivre/products.py +++ b/src/brightdata/datasets/mercadolivre/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MercadolivreProducts(BaseDataset): diff --git a/src/brightdata/datasets/metrocuadrado/properties.py b/src/brightdata/datasets/metrocuadrado/properties.py index 2dd75ad..80e1406 100644 --- a/src/brightdata/datasets/metrocuadrado/properties.py +++ b/src/brightdata/datasets/metrocuadrado/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MetrocuadradoProperties(BaseDataset): diff --git a/src/brightdata/datasets/microcenter/products.py b/src/brightdata/datasets/microcenter/products.py index c377913..117cd5b 100644 --- a/src/brightdata/datasets/microcenter/products.py +++ b/src/brightdata/datasets/microcenter/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MicroCenterProducts(BaseDataset): diff --git a/src/brightdata/datasets/montblanc/products.py b/src/brightdata/datasets/montblanc/products.py index afacc99..e5c3d87 100644 --- a/src/brightdata/datasets/montblanc/products.py +++ b/src/brightdata/datasets/montblanc/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MontblancProducts(BaseDataset): diff --git a/src/brightdata/datasets/mouser/products.py b/src/brightdata/datasets/mouser/products.py index 0a42a60..4d07998 100644 --- a/src/brightdata/datasets/mouser/products.py +++ 
b/src/brightdata/datasets/mouser/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MouserProducts(BaseDataset): diff --git a/src/brightdata/datasets/moynat/products.py b/src/brightdata/datasets/moynat/products.py index 069dce9..eade7d4 100644 --- a/src/brightdata/datasets/moynat/products.py +++ b/src/brightdata/datasets/moynat/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MoynatProducts(BaseDataset): diff --git a/src/brightdata/datasets/mybobs/products.py b/src/brightdata/datasets/mybobs/products.py index 9029dff..8d6075a 100644 --- a/src/brightdata/datasets/mybobs/products.py +++ b/src/brightdata/datasets/mybobs/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MybobsProducts(BaseDataset): diff --git a/src/brightdata/datasets/myntra/products.py b/src/brightdata/datasets/myntra/products.py index adc5702..f252198 100644 --- a/src/brightdata/datasets/myntra/products.py +++ b/src/brightdata/datasets/myntra/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class MyntraProducts(BaseDataset): diff --git a/src/brightdata/datasets/naver/products.py b/src/brightdata/datasets/naver/products.py index 804daa6..8e0e4eb 100644 --- a/src/brightdata/datasets/naver/products.py +++ b/src/brightdata/datasets/naver/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class NaverProducts(BaseDataset): diff --git a/src/brightdata/datasets/nba/players_stats.py 
b/src/brightdata/datasets/nba/players_stats.py index b4f5ca5..19b6cf7 100644 --- a/src/brightdata/datasets/nba/players_stats.py +++ b/src/brightdata/datasets/nba/players_stats.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class NBAPlayersStats(BaseDataset): diff --git a/src/brightdata/datasets/olx/ads.py b/src/brightdata/datasets/olx/ads.py index b150502..ea33529 100644 --- a/src/brightdata/datasets/olx/ads.py +++ b/src/brightdata/datasets/olx/ads.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class OLXBrazil(BaseDataset): diff --git a/src/brightdata/datasets/otodom/properties.py b/src/brightdata/datasets/otodom/properties.py index a165106..5f0104a 100644 --- a/src/brightdata/datasets/otodom/properties.py +++ b/src/brightdata/datasets/otodom/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class OtodomPoland(BaseDataset): diff --git a/src/brightdata/datasets/owler/companies.py b/src/brightdata/datasets/owler/companies.py index 6422d2b..76cd6a6 100644 --- a/src/brightdata/datasets/owler/companies.py +++ b/src/brightdata/datasets/owler/companies.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/ozon/products.py b/src/brightdata/datasets/ozon/products.py index 9b1fcab..d3a86e5 100644 --- a/src/brightdata/datasets/ozon/products.py +++ b/src/brightdata/datasets/ozon/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class OzonProducts(BaseDataset): diff --git 
a/src/brightdata/datasets/pinterest/posts.py b/src/brightdata/datasets/pinterest/posts.py index 66270da..43666d5 100644 --- a/src/brightdata/datasets/pinterest/posts.py +++ b/src/brightdata/datasets/pinterest/posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class PinterestPosts(BaseDataset): diff --git a/src/brightdata/datasets/pinterest/profiles.py b/src/brightdata/datasets/pinterest/profiles.py index b0fad50..a94c9e6 100644 --- a/src/brightdata/datasets/pinterest/profiles.py +++ b/src/brightdata/datasets/pinterest/profiles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class PinterestProfiles(BaseDataset): diff --git a/src/brightdata/datasets/pitchbook/companies.py b/src/brightdata/datasets/pitchbook/companies.py index 41afd2b..0aae293 100644 --- a/src/brightdata/datasets/pitchbook/companies.py +++ b/src/brightdata/datasets/pitchbook/companies.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/prada/products.py b/src/brightdata/datasets/prada/products.py index e5b2f4d..b6c0698 100644 --- a/src/brightdata/datasets/prada/products.py +++ b/src/brightdata/datasets/prada/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class PradaProducts(BaseDataset): diff --git a/src/brightdata/datasets/properati/properties.py b/src/brightdata/datasets/properati/properties.py index 34d8a1c..dd59812 100644 --- a/src/brightdata/datasets/properati/properties.py +++ b/src/brightdata/datasets/properati/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from 
...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ProperatiProperties(BaseDataset): diff --git a/src/brightdata/datasets/quora/posts.py b/src/brightdata/datasets/quora/posts.py index 992b1c0..1462a4a 100644 --- a/src/brightdata/datasets/quora/posts.py +++ b/src/brightdata/datasets/quora/posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class QuoraPosts(BaseDataset): diff --git a/src/brightdata/datasets/raymourflanigan/products.py b/src/brightdata/datasets/raymourflanigan/products.py index 664e886..0390e72 100644 --- a/src/brightdata/datasets/raymourflanigan/products.py +++ b/src/brightdata/datasets/raymourflanigan/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class RaymourFlaniganProducts(BaseDataset): diff --git a/src/brightdata/datasets/real_estate/australia.py b/src/brightdata/datasets/real_estate/australia.py index f5bfe86..42c7bb9 100644 --- a/src/brightdata/datasets/real_estate/australia.py +++ b/src/brightdata/datasets/real_estate/australia.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class AustraliaRealEstate(BaseDataset): diff --git a/src/brightdata/datasets/realtor/international_properties.py b/src/brightdata/datasets/realtor/international_properties.py index 47987b6..f8dde86 100644 --- a/src/brightdata/datasets/realtor/international_properties.py +++ b/src/brightdata/datasets/realtor/international_properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class RealtorInternationalProperties(BaseDataset): diff --git a/src/brightdata/datasets/reddit/comments.py 
b/src/brightdata/datasets/reddit/comments.py index bf1d807..3b2342f 100644 --- a/src/brightdata/datasets/reddit/comments.py +++ b/src/brightdata/datasets/reddit/comments.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class RedditComments(BaseDataset): diff --git a/src/brightdata/datasets/reddit/posts.py b/src/brightdata/datasets/reddit/posts.py index 6d0401f..c9283a4 100644 --- a/src/brightdata/datasets/reddit/posts.py +++ b/src/brightdata/datasets/reddit/posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class RedditPosts(BaseDataset): diff --git a/src/brightdata/datasets/rona/products.py b/src/brightdata/datasets/rona/products.py index 089e739..a0ee538 100644 --- a/src/brightdata/datasets/rona/products.py +++ b/src/brightdata/datasets/rona/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class RonaProducts(BaseDataset): diff --git a/src/brightdata/datasets/sephora/products.py b/src/brightdata/datasets/sephora/products.py index f901f43..61410d4 100644 --- a/src/brightdata/datasets/sephora/products.py +++ b/src/brightdata/datasets/sephora/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class SephoraProducts(BaseDataset): diff --git a/src/brightdata/datasets/shein/products.py b/src/brightdata/datasets/shein/products.py index 3f61959..800f822 100644 --- a/src/brightdata/datasets/shein/products.py +++ b/src/brightdata/datasets/shein/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class 
SheinProducts(BaseDataset): diff --git a/src/brightdata/datasets/shopee/products.py b/src/brightdata/datasets/shopee/products.py index 7fdc62a..6b32b66 100644 --- a/src/brightdata/datasets/shopee/products.py +++ b/src/brightdata/datasets/shopee/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ShopeeProducts(BaseDataset): diff --git a/src/brightdata/datasets/sleepnumber/products.py b/src/brightdata/datasets/sleepnumber/products.py index d171442..8584d57 100644 --- a/src/brightdata/datasets/sleepnumber/products.py +++ b/src/brightdata/datasets/sleepnumber/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class SleepNumberProducts(BaseDataset): diff --git a/src/brightdata/datasets/slintel/companies.py b/src/brightdata/datasets/slintel/companies.py index 6115b4e..63ebbe1 100644 --- a/src/brightdata/datasets/slintel/companies.py +++ b/src/brightdata/datasets/slintel/companies.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/snapchat/posts.py b/src/brightdata/datasets/snapchat/posts.py index 695ffc5..10bc728 100644 --- a/src/brightdata/datasets/snapchat/posts.py +++ b/src/brightdata/datasets/snapchat/posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class SnapchatPosts(BaseDataset): diff --git a/src/brightdata/datasets/tiktok/comments.py b/src/brightdata/datasets/tiktok/comments.py index f926d9d..1fa22a4 100644 --- a/src/brightdata/datasets/tiktok/comments.py +++ b/src/brightdata/datasets/tiktok/comments.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if 
TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class TikTokComments(BaseDataset): diff --git a/src/brightdata/datasets/tiktok/posts.py b/src/brightdata/datasets/tiktok/posts.py index 3158222..8b9fd7b 100644 --- a/src/brightdata/datasets/tiktok/posts.py +++ b/src/brightdata/datasets/tiktok/posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class TikTokPosts(BaseDataset): diff --git a/src/brightdata/datasets/tiktok/profiles.py b/src/brightdata/datasets/tiktok/profiles.py index f3164df..e94e657 100644 --- a/src/brightdata/datasets/tiktok/profiles.py +++ b/src/brightdata/datasets/tiktok/profiles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class TikTokProfiles(BaseDataset): diff --git a/src/brightdata/datasets/tiktok/shop.py b/src/brightdata/datasets/tiktok/shop.py index 6b90747..0cf46f6 100644 --- a/src/brightdata/datasets/tiktok/shop.py +++ b/src/brightdata/datasets/tiktok/shop.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class TikTokShop(BaseDataset): diff --git a/src/brightdata/datasets/toctoc/properties.py b/src/brightdata/datasets/toctoc/properties.py index 9937c33..4eeb34c 100644 --- a/src/brightdata/datasets/toctoc/properties.py +++ b/src/brightdata/datasets/toctoc/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ToctocProperties(BaseDataset): diff --git a/src/brightdata/datasets/tokopedia/products.py b/src/brightdata/datasets/tokopedia/products.py index 8856fcd..bcd33b5 100644 --- a/src/brightdata/datasets/tokopedia/products.py +++ 
b/src/brightdata/datasets/tokopedia/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class TokopediaProducts(BaseDataset): diff --git a/src/brightdata/datasets/toysrus/products.py b/src/brightdata/datasets/toysrus/products.py index fbe4cf0..f0ad329 100644 --- a/src/brightdata/datasets/toysrus/products.py +++ b/src/brightdata/datasets/toysrus/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ToysRUsProducts(BaseDataset): diff --git a/src/brightdata/datasets/trustpilot/reviews.py b/src/brightdata/datasets/trustpilot/reviews.py index 190e9fe..25672d1 100644 --- a/src/brightdata/datasets/trustpilot/reviews.py +++ b/src/brightdata/datasets/trustpilot/reviews.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/trustradius/reviews.py b/src/brightdata/datasets/trustradius/reviews.py index 4e0cae4..ed3f18e 100644 --- a/src/brightdata/datasets/trustradius/reviews.py +++ b/src/brightdata/datasets/trustradius/reviews.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/ventureradar/companies.py b/src/brightdata/datasets/ventureradar/companies.py index dc83f50..d70f5ef 100644 --- a/src/brightdata/datasets/ventureradar/companies.py +++ b/src/brightdata/datasets/ventureradar/companies.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/vimeo/videos.py 
b/src/brightdata/datasets/vimeo/videos.py index 1a00204..6c226f1 100644 --- a/src/brightdata/datasets/vimeo/videos.py +++ b/src/brightdata/datasets/vimeo/videos.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class VimeoVideos(BaseDataset): diff --git a/src/brightdata/datasets/walmart/products.py b/src/brightdata/datasets/walmart/products.py index 53f4cc2..c6d8f57 100644 --- a/src/brightdata/datasets/walmart/products.py +++ b/src/brightdata/datasets/walmart/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class WalmartProducts(BaseDataset): diff --git a/src/brightdata/datasets/walmart/sellers.py b/src/brightdata/datasets/walmart/sellers.py index 65f9884..3dbb556 100644 --- a/src/brightdata/datasets/walmart/sellers.py +++ b/src/brightdata/datasets/walmart/sellers.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class WalmartSellersInfo(BaseDataset): diff --git a/src/brightdata/datasets/wayfair/products.py b/src/brightdata/datasets/wayfair/products.py index a67cdae..edd1df1 100644 --- a/src/brightdata/datasets/wayfair/products.py +++ b/src/brightdata/datasets/wayfair/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class WayfairProducts(BaseDataset): diff --git a/src/brightdata/datasets/webmotors/vehicles.py b/src/brightdata/datasets/webmotors/vehicles.py index 443570c..3b72aa4 100644 --- a/src/brightdata/datasets/webmotors/vehicles.py +++ b/src/brightdata/datasets/webmotors/vehicles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine 
import AsyncEngine class WebmotorsBrasil(BaseDataset): diff --git a/src/brightdata/datasets/wikipedia/articles.py b/src/brightdata/datasets/wikipedia/articles.py index 49c4db5..607f1b3 100644 --- a/src/brightdata/datasets/wikipedia/articles.py +++ b/src/brightdata/datasets/wikipedia/articles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class WikipediaArticles(BaseDataset): diff --git a/src/brightdata/datasets/wildberries/products.py b/src/brightdata/datasets/wildberries/products.py index 5f3b40e..52f13d8 100644 --- a/src/brightdata/datasets/wildberries/products.py +++ b/src/brightdata/datasets/wildberries/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class WildberriesProducts(BaseDataset): diff --git a/src/brightdata/datasets/world_population/countries.py b/src/brightdata/datasets/world_population/countries.py index 44833b7..a406b23 100644 --- a/src/brightdata/datasets/world_population/countries.py +++ b/src/brightdata/datasets/world_population/countries.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class WorldPopulation(BaseDataset): diff --git a/src/brightdata/datasets/world_zipcodes/zipcodes.py b/src/brightdata/datasets/world_zipcodes/zipcodes.py index f406a26..81e4a5e 100644 --- a/src/brightdata/datasets/world_zipcodes/zipcodes.py +++ b/src/brightdata/datasets/world_zipcodes/zipcodes.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class WorldZipcodes(BaseDataset): diff --git a/src/brightdata/datasets/x_twitter/posts.py b/src/brightdata/datasets/x_twitter/posts.py index 4c5cdb4..4cc05d3 100644 --- 
a/src/brightdata/datasets/x_twitter/posts.py +++ b/src/brightdata/datasets/x_twitter/posts.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class XTwitterPosts(BaseDataset): diff --git a/src/brightdata/datasets/x_twitter/profiles.py b/src/brightdata/datasets/x_twitter/profiles.py index 6da2550..c5e4f2b 100644 --- a/src/brightdata/datasets/x_twitter/profiles.py +++ b/src/brightdata/datasets/x_twitter/profiles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class XTwitterProfiles(BaseDataset): diff --git a/src/brightdata/datasets/xing/profiles.py b/src/brightdata/datasets/xing/profiles.py index 8469841..04ed868 100644 --- a/src/brightdata/datasets/xing/profiles.py +++ b/src/brightdata/datasets/xing/profiles.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/yahoo_finance/businesses.py b/src/brightdata/datasets/yahoo_finance/businesses.py index 8ccbfae..2aef7f7 100644 --- a/src/brightdata/datasets/yahoo_finance/businesses.py +++ b/src/brightdata/datasets/yahoo_finance/businesses.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class YahooFinanceBusinesses(BaseDataset): diff --git a/src/brightdata/datasets/yapo/ads.py b/src/brightdata/datasets/yapo/ads.py index b19d8c2..b666fd8 100644 --- a/src/brightdata/datasets/yapo/ads.py +++ b/src/brightdata/datasets/yapo/ads.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class YapoChile(BaseDataset): diff --git 
a/src/brightdata/datasets/yelp/businesses.py b/src/brightdata/datasets/yelp/businesses.py index b7f26ba..d350fb7 100644 --- a/src/brightdata/datasets/yelp/businesses.py +++ b/src/brightdata/datasets/yelp/businesses.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/yelp/reviews.py b/src/brightdata/datasets/yelp/reviews.py index 3a9f04e..f8355c4 100644 --- a/src/brightdata/datasets/yelp/reviews.py +++ b/src/brightdata/datasets/yelp/reviews.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/youtube/comments.py b/src/brightdata/datasets/youtube/comments.py index 81867aa..28005c2 100644 --- a/src/brightdata/datasets/youtube/comments.py +++ b/src/brightdata/datasets/youtube/comments.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class YouTubeComments(BaseDataset): diff --git a/src/brightdata/datasets/youtube/profiles.py b/src/brightdata/datasets/youtube/profiles.py index 35379f8..164a98b 100644 --- a/src/brightdata/datasets/youtube/profiles.py +++ b/src/brightdata/datasets/youtube/profiles.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class YouTubeProfiles(BaseDataset): diff --git a/src/brightdata/datasets/youtube/videos.py b/src/brightdata/datasets/youtube/videos.py index 12de2f9..5e9ec56 100644 --- a/src/brightdata/datasets/youtube/videos.py +++ b/src/brightdata/datasets/youtube/videos.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import 
AsyncEngine class YouTubeVideos(BaseDataset): diff --git a/src/brightdata/datasets/ysl/products.py b/src/brightdata/datasets/ysl/products.py index 3ca73e6..c7f4b73 100644 --- a/src/brightdata/datasets/ysl/products.py +++ b/src/brightdata/datasets/ysl/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class YSLProducts(BaseDataset): diff --git a/src/brightdata/datasets/zalando/products.py b/src/brightdata/datasets/zalando/products.py index 8708d4e..a214632 100644 --- a/src/brightdata/datasets/zalando/products.py +++ b/src/brightdata/datasets/zalando/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ZalandoProducts(BaseDataset): diff --git a/src/brightdata/datasets/zara/home_products.py b/src/brightdata/datasets/zara/home_products.py index 32f340c..28d3c19 100644 --- a/src/brightdata/datasets/zara/home_products.py +++ b/src/brightdata/datasets/zara/home_products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ZaraHomeProducts(BaseDataset): diff --git a/src/brightdata/datasets/zara/products.py b/src/brightdata/datasets/zara/products.py index 5cd494c..978c782 100644 --- a/src/brightdata/datasets/zara/products.py +++ b/src/brightdata/datasets/zara/products.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ZaraProducts(BaseDataset): diff --git a/src/brightdata/datasets/zillow/price_history.py b/src/brightdata/datasets/zillow/price_history.py index 37df2f2..e0e31a3 100644 --- a/src/brightdata/datasets/zillow/price_history.py +++ b/src/brightdata/datasets/zillow/price_history.py @@ -11,7 +11,7 @@ from ..base 
import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ZillowPriceHistory(BaseDataset): diff --git a/src/brightdata/datasets/zillow/properties.py b/src/brightdata/datasets/zillow/properties.py index 72c594b..7107ded 100644 --- a/src/brightdata/datasets/zillow/properties.py +++ b/src/brightdata/datasets/zillow/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ZillowProperties(BaseDataset): diff --git a/src/brightdata/datasets/zonaprop/properties.py b/src/brightdata/datasets/zonaprop/properties.py index ba29f54..bae3a73 100644 --- a/src/brightdata/datasets/zonaprop/properties.py +++ b/src/brightdata/datasets/zonaprop/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ZonapropArgentina(BaseDataset): diff --git a/src/brightdata/datasets/zoominfo/companies.py b/src/brightdata/datasets/zoominfo/companies.py index 25fe578..b23c72a 100644 --- a/src/brightdata/datasets/zoominfo/companies.py +++ b/src/brightdata/datasets/zoominfo/companies.py @@ -12,7 +12,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine # Field categories diff --git a/src/brightdata/datasets/zoopla/properties.py b/src/brightdata/datasets/zoopla/properties.py index 2038690..5cd299e 100644 --- a/src/brightdata/datasets/zoopla/properties.py +++ b/src/brightdata/datasets/zoopla/properties.py @@ -11,7 +11,7 @@ from ..base import BaseDataset if TYPE_CHECKING: - from ...core.async_engine import AsyncEngine + from ...core.engine import AsyncEngine class ZooplaProperties(BaseDataset): diff --git a/src/brightdata/exceptions/__init__.py b/src/brightdata/exceptions/__init__.py index afb021a..9974381 100644 --- 
a/src/brightdata/exceptions/__init__.py +++ b/src/brightdata/exceptions/__init__.py @@ -6,7 +6,6 @@ AuthenticationError, APIError, DataNotReadyError, - TimeoutError, ZoneError, NetworkError, SSLError, @@ -18,7 +17,6 @@ "AuthenticationError", "APIError", "DataNotReadyError", - "TimeoutError", "ZoneError", "NetworkError", "SSLError", diff --git a/src/brightdata/exceptions/errors.py b/src/brightdata/exceptions/errors.py index e16d2cc..fc476d3 100644 --- a/src/brightdata/exceptions/errors.py +++ b/src/brightdata/exceptions/errors.py @@ -45,12 +45,6 @@ class DataNotReadyError(BrightDataError): pass -class TimeoutError(BrightDataError): - """Operation timed out.""" - - pass - - class ZoneError(BrightDataError): """Zone operation failed.""" diff --git a/src/brightdata/protocols.py b/src/brightdata/protocols.py deleted file mode 100644 index 0c8ad8b..0000000 --- a/src/brightdata/protocols.py +++ /dev/null @@ -1 +0,0 @@ -"""Interface definitions (typing.Protocol).""" diff --git a/src/brightdata/scraper_studio/__init__.py b/src/brightdata/scraper_studio/__init__.py index 54f7c7f..bedfc5a 100644 --- a/src/brightdata/scraper_studio/__init__.py +++ b/src/brightdata/scraper_studio/__init__.py @@ -1,5 +1,6 @@ """Scraper Studio - trigger and fetch results from user-created custom scrapers.""" from .models import ScraperStudioJob, JobStatus +from .service import ScraperStudioService -__all__ = ["ScraperStudioJob", "JobStatus"] +__all__ = ["ScraperStudioJob", "JobStatus", "ScraperStudioService"] diff --git a/src/brightdata/scraper_studio/client.py b/src/brightdata/scraper_studio/client.py index 2138025..ad9f8c5 100644 --- a/src/brightdata/scraper_studio/client.py +++ b/src/brightdata/scraper_studio/client.py @@ -12,7 +12,7 @@ from typing import Dict, List, Any from ..core.engine import AsyncEngine -from ..constants import HTTP_OK, HTTP_ACCEPTED +from http import HTTPStatus from ..exceptions import APIError, DataNotReadyError @@ -63,7 +63,7 @@ async def trigger_immediate( params = 
{"collector": collector} async with self.engine.post_to_url(url, json_data=input, params=params) as response: - if response.status in (HTTP_OK, HTTP_ACCEPTED): + if response.status in (HTTPStatus.OK, HTTPStatus.ACCEPTED): data = await response.json() response_id = data.get("response_id") if not response_id: @@ -97,7 +97,7 @@ async def fetch_immediate_result( params = {"response_id": response_id} async with self.engine.get_from_url(url, params=params) as response: - if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: return await response.json() elif response.status == 202: raise DataNotReadyError(f"Data not ready for response_id={response_id}") @@ -127,7 +127,7 @@ async def get_status( url = f"{BASE_URL}/dca/log/{job_id}" async with self.engine.get_from_url(url) as response: - if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: return await response.json() else: error_text = await response.text() diff --git a/src/brightdata/api/scraper_studio_service.py b/src/brightdata/scraper_studio/service.py similarity index 97% rename from src/brightdata/api/scraper_studio_service.py rename to src/brightdata/scraper_studio/service.py index c853e1e..27b0e3c 100644 --- a/src/brightdata/api/scraper_studio_service.py +++ b/src/brightdata/scraper_studio/service.py @@ -9,8 +9,8 @@ from typing import Dict, List, Any, Union, TYPE_CHECKING -from ..scraper_studio.client import ScraperStudioAPIClient -from ..scraper_studio.models import ScraperStudioJob, JobStatus +from .client import ScraperStudioAPIClient +from .models import ScraperStudioJob, JobStatus from ..constants import SCRAPER_STUDIO_DEFAULT_TIMEOUT, SCRAPER_STUDIO_POLL_INTERVAL if TYPE_CHECKING: diff --git a/src/brightdata/scrapers/__init__.py b/src/brightdata/scrapers/__init__.py index c334eca..f01ea4e 100644 --- a/src/brightdata/scrapers/__init__.py +++ b/src/brightdata/scrapers/__init__.py @@ -60,6 +60,16 @@ except ImportError: YouTubeSearchScraper = None +try: + from .digikey.scraper 
import DigiKeyScraper +except ImportError: + DigiKeyScraper = None + +try: + from .reddit.scraper import RedditScraper +except ImportError: + RedditScraper = None + __all__ = [ "BaseWebScraper", @@ -79,4 +89,6 @@ "TikTokSearchScraper", "YouTubeScraper", "YouTubeSearchScraper", + "DigiKeyScraper", + "RedditScraper", ] diff --git a/src/brightdata/scrapers/api_client.py b/src/brightdata/scrapers/api_client.py index 099f164..49f4ae8 100644 --- a/src/brightdata/scrapers/api_client.py +++ b/src/brightdata/scrapers/api_client.py @@ -10,7 +10,7 @@ from typing import List, Dict, Any, Optional from ..core.engine import AsyncEngine -from ..constants import HTTP_OK +from http import HTTPStatus from ..exceptions import APIError, DataNotReadyError @@ -78,7 +78,7 @@ async def trigger( async with self.engine.post_to_url( self.TRIGGER_URL, json_data=payload, params=params ) as response: - if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: data = await response.json() return data.get("snapshot_id") else: @@ -101,7 +101,7 @@ async def get_status(self, snapshot_id: str) -> str: url = f"{self.STATUS_URL}/{snapshot_id}" async with self.engine.get_from_url(url) as response: - if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: data = await response.json() return data.get("status", "unknown") else: @@ -125,7 +125,7 @@ async def fetch_result(self, snapshot_id: str, format: str = "json") -> Any: params = {"format": format} async with self.engine.get_from_url(url, params=params) as response: - if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: if format == "json": return await response.json() else: diff --git a/src/brightdata/scrapers/digikey/__init__.py b/src/brightdata/scrapers/digikey/__init__.py new file mode 100644 index 0000000..2b3a55d --- /dev/null +++ b/src/brightdata/scrapers/digikey/__init__.py @@ -0,0 +1,3 @@ +from .scraper import DigiKeyScraper + +__all__ = ["DigiKeyScraper"] diff --git 
a/src/brightdata/scrapers/digikey/scraper.py b/src/brightdata/scrapers/digikey/scraper.py new file mode 100644 index 0000000..3ffeb98 --- /dev/null +++ b/src/brightdata/scrapers/digikey/scraper.py @@ -0,0 +1,252 @@ +""" +DigiKey scraper - URL-based collection and category discovery for electronic components. + +Supports: +- Products: collect by URL +- Category discovery: discover new parts by category URL + +API Specifications: +- client.scrape.digikey.products(url, ...) # async +- client.scrape.digikey.products_sync(url, ...) # sync +- client.scrape.digikey.discover_by_category(url, ...) # async +- client.scrape.digikey.discover_by_category_sync(url, ...) # sync +""" + +import asyncio +from typing import List, Any, Union + +from ..base import BaseWebScraper +from ..registry import register +from ..job import ScrapeJob +from ...models import ScrapeResult +from ...utils.validation import validate_url, validate_url_list +from ...utils.function_detection import get_caller_function_name +from ...constants import DEFAULT_POLL_INTERVAL, DEFAULT_TIMEOUT_MEDIUM, DEFAULT_COST_PER_RECORD + + +@register("digikey") +class DigiKeyScraper(BaseWebScraper): + """ + DigiKey scraper for electronic components. + + Extracts structured data from DigiKey URLs for: + - Products (collect by URL) + - Category discovery (discover new parts from category pages) + + Example: + >>> scraper = DigiKeyScraper(bearer_token="token") + >>> + >>> # Collect product data + >>> result = await scraper.products( + ... url="https://www.digikey.com/en/products/detail/..." + ... ) + >>> + >>> # Discover products by category + >>> result = await scraper.discover_by_category( + ... url="https://www.digikey.com/en/products/category/..." + ... 
) + """ + + # Dataset ID (single dataset for all operations) + DATASET_ID = "gd_lj74waf72416ro0k65" + + PLATFORM_NAME = "digikey" + MIN_POLL_TIMEOUT = DEFAULT_TIMEOUT_MEDIUM + COST_PER_RECORD = DEFAULT_COST_PER_RECORD + + # ============================================================================ + # PRODUCTS - Collect by URL + # ============================================================================ + + async def products( + self, + url: Union[str, List[str]], + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> Union[ScrapeResult, List[ScrapeResult]]: + """ + Collect DigiKey product data by URL (async). + + Args: + url: Product URL(s) like https://www.digikey.com/en/products/detail/... + timeout: Maximum wait time in seconds (default: 240) + + Returns: + ScrapeResult or List[ScrapeResult] with product data + + Example: + >>> result = await scraper.products( + ... url="https://www.digikey.com/en/products/detail/STMicroelectronics/STM32F407VGT6/2747117" + ... ) + >>> print(result.data) + """ + if isinstance(url, str): + validate_url(url) + else: + validate_url_list(url) + + is_single = isinstance(url, str) + url_list = [url] if is_single else url + payload = [{"url": u} for u in url_list] + + sdk_function = get_caller_function_name() + result = await self.workflow_executor.execute( + payload=payload, + dataset_id=self.DATASET_ID, + poll_interval=DEFAULT_POLL_INTERVAL, + poll_timeout=timeout, + include_errors=True, + sdk_function=sdk_function, + normalize_func=self.normalize_result, + ) + + if is_single and isinstance(result.data, list) and len(result.data) == 1: + result.url = url if isinstance(url, str) else url[0] + result.data = result.data[0] + return result + + def products_sync( + self, + url: Union[str, List[str]], + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> Union[ScrapeResult, List[ScrapeResult]]: + """Collect DigiKey product data by URL (sync).""" + + async def _run(): + async with self.engine: + return await self.products(url, timeout) + + return 
asyncio.run(_run()) + + # --- Products Trigger/Status/Fetch --- + + async def products_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger DigiKey products collection (manual control).""" + url_list = [url] if isinstance(url, str) else url + payload = [{"url": u} for u in url_list] + + snapshot_id = await self.api_client.trigger(payload=payload, dataset_id=self.DATASET_ID) + return ScrapeJob( + snapshot_id=snapshot_id, + api_client=self.api_client, + platform_name=self.PLATFORM_NAME, + cost_per_record=self.COST_PER_RECORD, + ) + + def products_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger DigiKey products collection (sync).""" + return asyncio.run(self.products_trigger(url)) + + async def products_status(self, snapshot_id: str) -> str: + """Check DigiKey products collection status.""" + return await self._check_status_async(snapshot_id) + + def products_status_sync(self, snapshot_id: str) -> str: + """Check DigiKey products collection status (sync).""" + return asyncio.run(self.products_status(snapshot_id)) + + async def products_fetch(self, snapshot_id: str) -> Any: + """Fetch DigiKey products results.""" + return await self._fetch_results_async(snapshot_id) + + def products_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch DigiKey products results (sync).""" + return asyncio.run(self.products_fetch(snapshot_id)) + + # ============================================================================ + # DISCOVER BY CATEGORY + # ============================================================================ + + async def discover_by_category( + self, + url: Union[str, List[str]], + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> ScrapeResult: + """ + Discover DigiKey products by category URL (async). + + Crawls category pages to discover new parts. + + Args: + url: Category URL(s) like https://www.digikey.com/en/products/category/... 
+ timeout: Maximum wait time in seconds (default: 240) + + Returns: + ScrapeResult with discovered products + + Example: + >>> result = await scraper.discover_by_category( + ... url="https://www.digikey.com/en/products/category/integrated-circuits/36" + ... ) + """ + if isinstance(url, str): + validate_url(url) + else: + validate_url_list(url) + + url_list = [url] if isinstance(url, str) else url + payload = [{"category_url": u} for u in url_list] + + sdk_function = get_caller_function_name() + result = await self.workflow_executor.execute( + payload=payload, + dataset_id=self.DATASET_ID, + poll_interval=DEFAULT_POLL_INTERVAL, + poll_timeout=timeout, + include_errors=True, + sdk_function=sdk_function, + normalize_func=self.normalize_result, + extra_params={"type": "discover_new", "discover_by": "category"}, + ) + return result + + def discover_by_category_sync( + self, + url: Union[str, List[str]], + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> ScrapeResult: + """Discover DigiKey products by category URL (sync).""" + + async def _run(): + async with self.engine: + return await self.discover_by_category(url, timeout) + + return asyncio.run(_run()) + + # --- Discover Trigger/Status/Fetch --- + + async def discover_by_category_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger DigiKey category discovery (manual control).""" + url_list = [url] if isinstance(url, str) else url + payload = [{"category_url": u} for u in url_list] + + snapshot_id = await self.api_client.trigger( + payload=payload, + dataset_id=self.DATASET_ID, + extra_params={"type": "discover_new", "discover_by": "category"}, + ) + return ScrapeJob( + snapshot_id=snapshot_id, + api_client=self.api_client, + platform_name=self.PLATFORM_NAME, + cost_per_record=self.COST_PER_RECORD, + ) + + def discover_by_category_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger DigiKey category discovery (sync).""" + return asyncio.run(self.discover_by_category_trigger(url)) + + 
async def discover_by_category_status(self, snapshot_id: str) -> str: + """Check DigiKey category discovery status.""" + return await self._check_status_async(snapshot_id) + + def discover_by_category_status_sync(self, snapshot_id: str) -> str: + """Check DigiKey category discovery status (sync).""" + return asyncio.run(self.discover_by_category_status(snapshot_id)) + + async def discover_by_category_fetch(self, snapshot_id: str) -> Any: + """Fetch DigiKey category discovery results.""" + return await self._fetch_results_async(snapshot_id) + + def discover_by_category_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch DigiKey category discovery results (sync).""" + return asyncio.run(self.discover_by_category_fetch(snapshot_id)) diff --git a/src/brightdata/scrapers/job.py b/src/brightdata/scrapers/job.py index 9534639..a4d079f 100644 --- a/src/brightdata/scrapers/job.py +++ b/src/brightdata/scrapers/job.py @@ -206,9 +206,9 @@ async def to_result( data=data, platform=self.platform_name, cost=estimated_cost, - timing_start=start_time, - timing_end=end_time, - metadata={"snapshot_id": self.snapshot_id}, + snapshot_id=self.snapshot_id, + trigger_sent_at=start_time, + data_fetched_at=end_time, ) except Exception as e: @@ -216,7 +216,7 @@ async def to_result( success=False, error=str(e), platform=self.platform_name, - timing_start=start_time, - timing_end=datetime.now(timezone.utc), - metadata={"snapshot_id": self.snapshot_id}, + snapshot_id=self.snapshot_id, + trigger_sent_at=start_time, + data_fetched_at=datetime.now(timezone.utc), ) diff --git a/src/brightdata/scrapers/reddit/__init__.py b/src/brightdata/scrapers/reddit/__init__.py new file mode 100644 index 0000000..b3d4e55 --- /dev/null +++ b/src/brightdata/scrapers/reddit/__init__.py @@ -0,0 +1,3 @@ +from .scraper import RedditScraper + +__all__ = ["RedditScraper"] diff --git a/src/brightdata/scrapers/reddit/scraper.py b/src/brightdata/scrapers/reddit/scraper.py new file mode 100644 index 0000000..3351cf6 --- 
/dev/null +++ b/src/brightdata/scrapers/reddit/scraper.py @@ -0,0 +1,504 @@ +""" +Reddit scraper - URL-based collection and discovery for posts and comments. + +Supports: +- Posts: collect by URL +- Posts: discover by keyword +- Posts: discover by subreddit URL +- Comments: collect by URL + +API Specifications: +- client.scrape.reddit.posts(url, ...) # async +- client.scrape.reddit.posts_sync(url, ...) # sync +- client.scrape.reddit.posts_by_keyword(keyword, ...) # async +- client.scrape.reddit.posts_by_keyword_sync(keyword, ...) # sync +- client.scrape.reddit.posts_by_subreddit(url, ...) # async +- client.scrape.reddit.posts_by_subreddit_sync(url, ...) # sync +- client.scrape.reddit.comments(url, ...) # async +- client.scrape.reddit.comments_sync(url, ...) # sync +""" + +import asyncio +from typing import List, Any, Optional, Union, Dict + +from ..base import BaseWebScraper +from ..registry import register +from ..job import ScrapeJob +from ...models import ScrapeResult +from ...utils.validation import validate_url, validate_url_list +from ...utils.function_detection import get_caller_function_name +from ...constants import DEFAULT_POLL_INTERVAL, DEFAULT_TIMEOUT_MEDIUM, DEFAULT_COST_PER_RECORD + + +@register("reddit") +class RedditScraper(BaseWebScraper): + """ + Reddit scraper for posts and comments. + + Extracts structured data from Reddit for: + - Posts (collect by URL) + - Posts (discover by keyword search) + - Posts (discover by subreddit URL) + - Comments (collect by URL) + + Example: + >>> scraper = RedditScraper(bearer_token="token") + >>> + >>> # Collect post data + >>> result = await scraper.posts( + ... url="https://www.reddit.com/r/python/comments/abc123/..." + ... ) + >>> + >>> # Discover posts by keyword + >>> result = await scraper.posts_by_keyword( + ... keyword="machine learning", + ... sort_by="Top" + ... ) + >>> + >>> # Discover posts from subreddit + >>> result = await scraper.posts_by_subreddit( + ... 
url="https://www.reddit.com/r/datascience/", + ... sort_by="Hot" + ... ) + >>> + >>> # Collect comments + >>> result = await scraper.comments( + ... url="https://www.reddit.com/r/python/comments/abc123/comment/xyz789/", + ... days_back=30 + ... ) + """ + + # Dataset IDs + DATASET_ID = "gd_lvz8ah06191smkebj4" # Posts (default) + DATASET_ID_POSTS = "gd_lvz8ah06191smkebj4" + DATASET_ID_COMMENTS = "gd_lvzdpsdlw09j6t702" + + PLATFORM_NAME = "reddit" + MIN_POLL_TIMEOUT = DEFAULT_TIMEOUT_MEDIUM + COST_PER_RECORD = DEFAULT_COST_PER_RECORD + + # ============================================================================ + # POSTS - Collect by URL + # ============================================================================ + + async def posts( + self, + url: Union[str, List[str]], + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> Union[ScrapeResult, List[ScrapeResult]]: + """ + Collect Reddit post data by URL (async). + + Args: + url: Post URL(s) like https://www.reddit.com/r/subreddit/comments/... + timeout: Maximum wait time in seconds (default: 240) + + Returns: + ScrapeResult or List[ScrapeResult] with post data + + Example: + >>> result = await scraper.posts( + ... url="https://www.reddit.com/r/python/comments/abc123/my_post/" + ... 
) + >>> print(result.data) + """ + if isinstance(url, str): + validate_url(url) + else: + validate_url_list(url) + + is_single = isinstance(url, str) + url_list = [url] if is_single else url + payload = [{"url": u} for u in url_list] + + sdk_function = get_caller_function_name() + result = await self.workflow_executor.execute( + payload=payload, + dataset_id=self.DATASET_ID_POSTS, + poll_interval=DEFAULT_POLL_INTERVAL, + poll_timeout=timeout, + include_errors=True, + sdk_function=sdk_function, + normalize_func=self.normalize_result, + ) + + if is_single and isinstance(result.data, list) and len(result.data) == 1: + result.url = url if isinstance(url, str) else url[0] + result.data = result.data[0] + return result + + def posts_sync( + self, + url: Union[str, List[str]], + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> Union[ScrapeResult, List[ScrapeResult]]: + """Collect Reddit post data by URL (sync).""" + + async def _run(): + async with self.engine: + return await self.posts(url, timeout) + + return asyncio.run(_run()) + + # --- Posts Trigger/Status/Fetch --- + + async def posts_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger Reddit posts collection (manual control).""" + url_list = [url] if isinstance(url, str) else url + payload = [{"url": u} for u in url_list] + + snapshot_id = await self.api_client.trigger( + payload=payload, dataset_id=self.DATASET_ID_POSTS + ) + return ScrapeJob( + snapshot_id=snapshot_id, + api_client=self.api_client, + platform_name=self.PLATFORM_NAME, + cost_per_record=self.COST_PER_RECORD, + ) + + def posts_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger Reddit posts collection (sync).""" + return asyncio.run(self.posts_trigger(url)) + + async def posts_status(self, snapshot_id: str) -> str: + """Check Reddit posts collection status.""" + return await self._check_status_async(snapshot_id) + + def posts_status_sync(self, snapshot_id: str) -> str: + """Check Reddit posts collection status 
(sync).""" + return asyncio.run(self.posts_status(snapshot_id)) + + async def posts_fetch(self, snapshot_id: str) -> Any: + """Fetch Reddit posts results.""" + return await self._fetch_results_async(snapshot_id) + + def posts_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch Reddit posts results (sync).""" + return asyncio.run(self.posts_fetch(snapshot_id)) + + # ============================================================================ + # POSTS - Discover by Keyword + # ============================================================================ + + async def posts_by_keyword( + self, + keyword: Union[str, List[str]], + date: Optional[Union[str, List[str]]] = None, + num_of_posts: Optional[Union[int, List[int]]] = None, + sort_by: Optional[Union[str, List[str]]] = None, + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> ScrapeResult: + """ + Discover Reddit posts by keyword search (async). + + Args: + keyword: Search keyword(s) + date: Time filter - "All time", "Past year", "Past month", "Past week", + "Past 24 hours", "Past hour" (optional) + num_of_posts: Maximum number of posts to collect (optional) + sort_by: Sort order - "Hot", "Top", "New", "Rising" (optional) + timeout: Maximum wait time in seconds (default: 240) + + Returns: + ScrapeResult with discovered posts + + Example: + >>> result = await scraper.posts_by_keyword( + ... keyword="machine learning", + ... date="Past week", + ... sort_by="Top", + ... num_of_posts=50 + ... 
) + """ + keywords = [keyword] if isinstance(keyword, str) else keyword + batch_size = len(keywords) + dates = self._normalize_param(date, batch_size, None) + nums = self._normalize_param(num_of_posts, batch_size, None) + sorts = self._normalize_param(sort_by, batch_size, None) + + payload = [] + for i in range(batch_size): + item: Dict[str, Any] = {"keyword": keywords[i]} + if dates[i] is not None: + item["date"] = dates[i] + if nums[i] is not None: + item["num_of_posts"] = nums[i] + if sorts[i] is not None: + item["sort_by"] = sorts[i] + payload.append(item) + + sdk_function = get_caller_function_name() + result = await self.workflow_executor.execute( + payload=payload, + dataset_id=self.DATASET_ID_POSTS, + poll_interval=DEFAULT_POLL_INTERVAL, + poll_timeout=timeout, + include_errors=True, + sdk_function=sdk_function, + normalize_func=self.normalize_result, + extra_params={"type": "discover_new", "discover_by": "keyword"}, + ) + return result + + def posts_by_keyword_sync( + self, + keyword: Union[str, List[str]], + date: Optional[Union[str, List[str]]] = None, + num_of_posts: Optional[Union[int, List[int]]] = None, + sort_by: Optional[Union[str, List[str]]] = None, + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> ScrapeResult: + """Discover Reddit posts by keyword search (sync).""" + + async def _run(): + async with self.engine: + return await self.posts_by_keyword(keyword, date, num_of_posts, sort_by, timeout) + + return asyncio.run(_run()) + + # ============================================================================ + # POSTS - Discover by Subreddit URL + # ============================================================================ + + async def posts_by_subreddit( + self, + url: Union[str, List[str]], + sort_by: Optional[Union[str, List[str]]] = None, + sort_by_time: Optional[Union[str, List[str]]] = None, + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> ScrapeResult: + """ + Discover Reddit posts from subreddit URL (async). 
+ + Args: + url: Subreddit URL(s) like https://www.reddit.com/r/datascience/ + sort_by: Sort order - "Hot", "New", "Rising", "Top" (optional) + sort_by_time: Time filter for sort - "Today", "Past week", + "Past month", "Past year", "All Time" (optional) + timeout: Maximum wait time in seconds (default: 240) + + Returns: + ScrapeResult with discovered posts + + Example: + >>> result = await scraper.posts_by_subreddit( + ... url="https://www.reddit.com/r/datascience/", + ... sort_by="Rising", + ... sort_by_time="All Time" + ... ) + """ + if isinstance(url, str): + validate_url(url) + else: + validate_url_list(url) + + urls = [url] if isinstance(url, str) else url + batch_size = len(urls) + sorts = self._normalize_param(sort_by, batch_size, None) + sort_times = self._normalize_param(sort_by_time, batch_size, None) + + payload = [] + for i in range(batch_size): + item: Dict[str, Any] = {"url": urls[i]} + if sorts[i] is not None: + item["sort_by"] = sorts[i] + if sort_times[i] is not None: + item["sort_by_time"] = sort_times[i] + payload.append(item) + + sdk_function = get_caller_function_name() + result = await self.workflow_executor.execute( + payload=payload, + dataset_id=self.DATASET_ID_POSTS, + poll_interval=DEFAULT_POLL_INTERVAL, + poll_timeout=timeout, + include_errors=True, + sdk_function=sdk_function, + normalize_func=self.normalize_result, + extra_params={"type": "discover_new", "discover_by": "subreddit_url"}, + ) + return result + + def posts_by_subreddit_sync( + self, + url: Union[str, List[str]], + sort_by: Optional[Union[str, List[str]]] = None, + sort_by_time: Optional[Union[str, List[str]]] = None, + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> ScrapeResult: + """Discover Reddit posts from subreddit URL (sync).""" + + async def _run(): + async with self.engine: + return await self.posts_by_subreddit(url, sort_by, sort_by_time, timeout) + + return asyncio.run(_run()) + + # ============================================================================ + # 
COMMENTS - Collect by URL + # ============================================================================ + + async def comments( + self, + url: Union[str, List[str]], + days_back: Optional[int] = None, + load_all_replies: Optional[bool] = None, + comment_limit: Optional[int] = None, + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> Union[ScrapeResult, List[ScrapeResult]]: + """ + Collect Reddit comments by URL (async). + + Args: + url: Comment thread or post URL(s) + days_back: Number of days to look back (optional) + load_all_replies: Whether to load all nested replies (optional) + comment_limit: Maximum number of comments to collect (optional) + timeout: Maximum wait time in seconds (default: 240) + + Returns: + ScrapeResult or List[ScrapeResult] with comment data + + Example: + >>> result = await scraper.comments( + ... url="https://www.reddit.com/r/python/comments/abc123/comment/xyz789/", + ... days_back=30, + ... load_all_replies=True, + ... comment_limit=100 + ... ) + """ + if isinstance(url, str): + validate_url(url) + else: + validate_url_list(url) + + is_single = isinstance(url, str) + url_list = [url] if is_single else url + + payload = [] + for u in url_list: + item: Dict[str, Any] = {"url": u} + if days_back is not None: + item["days_back"] = days_back + if load_all_replies is not None: + item["load_all_replies"] = load_all_replies + if comment_limit is not None: + item["comment_limit"] = comment_limit + payload.append(item) + + sdk_function = get_caller_function_name() + result = await self.workflow_executor.execute( + payload=payload, + dataset_id=self.DATASET_ID_COMMENTS, + poll_interval=DEFAULT_POLL_INTERVAL, + poll_timeout=timeout, + include_errors=True, + sdk_function=sdk_function, + normalize_func=self.normalize_result, + ) + + if is_single and isinstance(result.data, list) and len(result.data) == 1: + result.url = url if isinstance(url, str) else url[0] + result.data = result.data[0] + return result + + def comments_sync( + self, + url: 
Union[str, List[str]], + days_back: Optional[int] = None, + load_all_replies: Optional[bool] = None, + comment_limit: Optional[int] = None, + timeout: int = DEFAULT_TIMEOUT_MEDIUM, + ) -> Union[ScrapeResult, List[ScrapeResult]]: + """Collect Reddit comments by URL (sync).""" + + async def _run(): + async with self.engine: + return await self.comments(url, days_back, load_all_replies, comment_limit, timeout) + + return asyncio.run(_run()) + + # --- Comments Trigger/Status/Fetch --- + + async def comments_trigger( + self, + url: Union[str, List[str]], + days_back: Optional[int] = None, + load_all_replies: Optional[bool] = None, + comment_limit: Optional[int] = None, + ) -> ScrapeJob: + """Trigger Reddit comments collection (manual control).""" + url_list = [url] if isinstance(url, str) else url + + payload = [] + for u in url_list: + item: Dict[str, Any] = {"url": u} + if days_back is not None: + item["days_back"] = days_back + if load_all_replies is not None: + item["load_all_replies"] = load_all_replies + if comment_limit is not None: + item["comment_limit"] = comment_limit + payload.append(item) + + snapshot_id = await self.api_client.trigger( + payload=payload, dataset_id=self.DATASET_ID_COMMENTS + ) + return ScrapeJob( + snapshot_id=snapshot_id, + api_client=self.api_client, + platform_name=self.PLATFORM_NAME, + cost_per_record=self.COST_PER_RECORD, + ) + + def comments_trigger_sync( + self, + url: Union[str, List[str]], + days_back: Optional[int] = None, + load_all_replies: Optional[bool] = None, + comment_limit: Optional[int] = None, + ) -> ScrapeJob: + """Trigger Reddit comments collection (sync).""" + return asyncio.run(self.comments_trigger(url, days_back, load_all_replies, comment_limit)) + + async def comments_status(self, snapshot_id: str) -> str: + """Check Reddit comments collection status.""" + return await self._check_status_async(snapshot_id) + + def comments_status_sync(self, snapshot_id: str) -> str: + """Check Reddit comments collection status 
(sync).""" + return asyncio.run(self.comments_status(snapshot_id)) + + async def comments_fetch(self, snapshot_id: str) -> Any: + """Fetch Reddit comments results.""" + return await self._fetch_results_async(snapshot_id) + + def comments_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch Reddit comments results (sync).""" + return asyncio.run(self.comments_fetch(snapshot_id)) + + # ============================================================================ + # HELPER METHODS + # ============================================================================ + + def _normalize_param( + self, + param: Optional[Union[Any, List[Any]]], + target_length: int, + default_value: Any = None, + ) -> List[Any]: + """Normalize parameter to list of specified length.""" + if param is None: + return [default_value] * target_length + + if isinstance(param, (str, bool, int)): + return [param] * target_length + + if isinstance(param, list): + if len(param) < target_length: + last_val = param[-1] if param else default_value + return param + [last_val] * (target_length - len(param)) + return param[:target_length] + + return [default_value] * target_length diff --git a/src/brightdata/api/scrape_service.py b/src/brightdata/scrapers/service.py similarity index 78% rename from src/brightdata/api/scrape_service.py rename to src/brightdata/scrapers/service.py index 4b7b6d0..9aab941 100644 --- a/src/brightdata/api/scrape_service.py +++ b/src/brightdata/scrapers/service.py @@ -29,6 +29,8 @@ def __init__(self, client: "BrightDataClient"): self._perplexity = None self._tiktok = None self._youtube = None + self._digikey = None + self._reddit = None @property def amazon(self): @@ -46,7 +48,7 @@ def amazon(self): >>> result = client.scrape.amazon.products(keyword="laptop") """ if self._amazon is None: - from ..scrapers.amazon import AmazonScraper + from .amazon import AmazonScraper self._amazon = AmazonScraper( bearer_token=self._client.token, engine=self._client.engine @@ -75,7 +77,7 @@ def 
linkedin(self): >>> result = client.scrape.linkedin.companies(keyword="tech startup") """ if self._linkedin is None: - from ..scrapers.linkedin import LinkedInScraper + from .linkedin import LinkedInScraper self._linkedin = LinkedInScraper( bearer_token=self._client.token, engine=self._client.engine @@ -101,7 +103,7 @@ def chatgpt(self): ... ]) """ if self._chatgpt is None: - from ..scrapers.chatgpt import ChatGPTScraper + from .chatgpt import ChatGPTScraper self._chatgpt = ChatGPTScraper( bearer_token=self._client.token, engine=self._client.engine @@ -140,7 +142,7 @@ def facebook(self): ... ) """ if self._facebook is None: - from ..scrapers.facebook import FacebookScraper + from .facebook import FacebookScraper self._facebook = FacebookScraper( bearer_token=self._client.token, engine=self._client.engine @@ -177,7 +179,7 @@ def instagram(self): ... ) """ if self._instagram is None: - from ..scrapers.instagram import InstagramScraper + from .instagram import InstagramScraper self._instagram = InstagramScraper( bearer_token=self._client.token, engine=self._client.engine @@ -206,7 +208,7 @@ def perplexity(self): ... ) """ if self._perplexity is None: - from ..scrapers.perplexity import PerplexityScraper + from .perplexity import PerplexityScraper self._perplexity = PerplexityScraper( bearer_token=self._client.token, engine=self._client.engine @@ -249,7 +251,7 @@ def tiktok(self): ... ) """ if self._tiktok is None: - from ..scrapers.tiktok import TikTokScraper + from .tiktok import TikTokScraper self._tiktok = TikTokScraper( bearer_token=self._client.token, engine=self._client.engine @@ -282,9 +284,69 @@ def youtube(self): ... ) """ if self._youtube is None: - from ..scrapers.youtube import YouTubeScraper + from .youtube import YouTubeScraper self._youtube = YouTubeScraper( bearer_token=self._client.token, engine=self._client.engine ) return self._youtube + + @property + def digikey(self): + """ + Access DigiKey scraper. 
+ + Returns: + DigiKeyScraper instance for DigiKey product scraping + + Example: + >>> # Collect product data + >>> result = await client.scrape.digikey.products( + ... url="https://www.digikey.com/en/products/detail/..." + ... ) + >>> + >>> # Discover by category + >>> result = await client.scrape.digikey.discover_by_category( + ... url="https://www.digikey.com/en/products/category/..." + ... ) + """ + if self._digikey is None: + from .digikey import DigiKeyScraper + + self._digikey = DigiKeyScraper( + bearer_token=self._client.token, engine=self._client.engine + ) + return self._digikey + + @property + def reddit(self): + """ + Access Reddit scraper. + + Returns: + RedditScraper instance for Reddit data extraction + + Example: + >>> # Collect post data + >>> result = await client.scrape.reddit.posts( + ... url="https://www.reddit.com/r/python/comments/..." + ... ) + >>> + >>> # Discover posts by keyword + >>> result = await client.scrape.reddit.posts_by_keyword( + ... keyword="machine learning", + ... sort_by="Top" + ... ) + >>> + >>> # Collect comments + >>> result = await client.scrape.reddit.comments( + ... url="https://www.reddit.com/r/python/comments/.../comment/..." + ... 
) + """ + if self._reddit is None: + from .reddit import RedditScraper + + self._reddit = RedditScraper( + bearer_token=self._client.token, engine=self._client.engine + ) + return self._reddit diff --git a/src/brightdata/api/serp/__init__.py b/src/brightdata/serp/__init__.py similarity index 83% rename from src/brightdata/api/serp/__init__.py rename to src/brightdata/serp/__init__.py index e244727..1ee43bc 100644 --- a/src/brightdata/api/serp/__init__.py +++ b/src/brightdata/serp/__init__.py @@ -4,10 +4,12 @@ from .google import GoogleSERPService from .bing import BingSERPService from .yandex import YandexSERPService +from .service import SearchService __all__ = [ "BaseSERPService", "GoogleSERPService", "BingSERPService", "YandexSERPService", + "SearchService", ] diff --git a/src/brightdata/api/serp/base.py b/src/brightdata/serp/base.py similarity index 98% rename from src/brightdata/api/serp/base.py rename to src/brightdata/serp/base.py index a151e01..ce4a05e 100644 --- a/src/brightdata/api/serp/base.py +++ b/src/brightdata/serp/base.py @@ -11,14 +11,14 @@ from .url_builder import BaseURLBuilder from .data_normalizer import BaseDataNormalizer -from ...core.engine import AsyncEngine -from ...models import SearchResult -from ...constants import HTTP_OK -from ...exceptions import ValidationError -from ...utils.validation import validate_zone_name -from ...utils.retry import retry_with_backoff -from ...utils.function_detection import get_caller_function_name -from ..async_unblocker import AsyncUnblockerClient +from ..core.engine import AsyncEngine +from ..models import SearchResult +from http import HTTPStatus +from ..exceptions import ValidationError +from ..utils.validation import validate_zone_name +from ..utils.retry import retry_with_backoff +from ..utils.function_detection import get_caller_function_name +from ..web_unlocker.async_client import AsyncUnblockerClient class BaseSERPService: @@ -268,7 +268,7 @@ async def _make_request(): ) as response: 
data_fetched_at = datetime.now(timezone.utc) - if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: text = await response.text() try: data = json.loads(text) diff --git a/src/brightdata/api/serp/bing.py b/src/brightdata/serp/bing.py similarity index 95% rename from src/brightdata/api/serp/bing.py rename to src/brightdata/serp/bing.py index d27066e..463e506 100644 --- a/src/brightdata/api/serp/bing.py +++ b/src/brightdata/serp/bing.py @@ -4,7 +4,7 @@ from .base import BaseSERPService from .url_builder import BingURLBuilder from .data_normalizer import BingDataNormalizer -from ...core.engine import AsyncEngine +from ..core.engine import AsyncEngine class BingSERPService(BaseSERPService): diff --git a/src/brightdata/api/serp/data_normalizer.py b/src/brightdata/serp/data_normalizer.py similarity index 99% rename from src/brightdata/api/serp/data_normalizer.py rename to src/brightdata/serp/data_normalizer.py index f1fa2af..c4cf102 100644 --- a/src/brightdata/api/serp/data_normalizer.py +++ b/src/brightdata/serp/data_normalizer.py @@ -3,7 +3,7 @@ import warnings from abc import ABC, abstractmethod from typing import Any -from ...types import NormalizedSERPData +from ..types import NormalizedSERPData class BaseDataNormalizer(ABC): diff --git a/src/brightdata/api/serp/google.py b/src/brightdata/serp/google.py similarity index 96% rename from src/brightdata/api/serp/google.py rename to src/brightdata/serp/google.py index 097d286..3bf662a 100644 --- a/src/brightdata/api/serp/google.py +++ b/src/brightdata/serp/google.py @@ -4,7 +4,7 @@ from .base import BaseSERPService from .url_builder import GoogleURLBuilder from .data_normalizer import GoogleDataNormalizer -from ...core.engine import AsyncEngine +from ..core.engine import AsyncEngine class GoogleSERPService(BaseSERPService): diff --git a/src/brightdata/api/search_service.py b/src/brightdata/serp/service.py similarity index 97% rename from src/brightdata/api/search_service.py rename to 
src/brightdata/serp/service.py index 410da46..1e59051 100644 --- a/src/brightdata/api/search_service.py +++ b/src/brightdata/serp/service.py @@ -12,9 +12,9 @@ if TYPE_CHECKING: from ..client import BrightDataClient - from .serp.google import GoogleSERPService - from .serp.bing import BingSERPService - from .serp.yandex import YandexSERPService + from .google import GoogleSERPService + from .bing import BingSERPService + from .yandex import YandexSERPService from ..scrapers.amazon.search import AmazonSearchScraper from ..scrapers.linkedin.search import LinkedInSearchScraper from ..scrapers.chatgpt.search import ChatGPTSearchService @@ -88,7 +88,7 @@ async def google( ... num_results=20 ... ) """ - from .serp import GoogleSERPService + from .google import GoogleSERPService if self._google_service is None: self._google_service = GoogleSERPService( @@ -117,7 +117,7 @@ async def bing( **kwargs, ) -> Union[SearchResult, List[SearchResult]]: """Search Bing asynchronously.""" - from .serp import BingSERPService + from .bing import BingSERPService if self._bing_service is None: self._bing_service = BingSERPService( @@ -145,7 +145,7 @@ async def yandex( **kwargs, ) -> Union[SearchResult, List[SearchResult]]: """Search Yandex asynchronously.""" - from .serp import YandexSERPService + from .yandex import YandexSERPService if self._yandex_service is None: self._yandex_service = YandexSERPService( diff --git a/src/brightdata/api/serp/url_builder.py b/src/brightdata/serp/url_builder.py similarity index 98% rename from src/brightdata/api/serp/url_builder.py rename to src/brightdata/serp/url_builder.py index 5397177..e25bc96 100644 --- a/src/brightdata/api/serp/url_builder.py +++ b/src/brightdata/serp/url_builder.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from typing import Optional from urllib.parse import quote_plus -from ...utils.location import LocationService, LocationFormat +from ..utils.location import LocationService, LocationFormat class BaseURLBuilder(ABC): 
diff --git a/src/brightdata/api/serp/yandex.py b/src/brightdata/serp/yandex.py similarity index 95% rename from src/brightdata/api/serp/yandex.py rename to src/brightdata/serp/yandex.py index 1fc8cb6..6173876 100644 --- a/src/brightdata/api/serp/yandex.py +++ b/src/brightdata/serp/yandex.py @@ -4,7 +4,7 @@ from .base import BaseSERPService from .url_builder import YandexURLBuilder from .data_normalizer import YandexDataNormalizer -from ...core.engine import AsyncEngine +from ..core.engine import AsyncEngine class YandexSERPService(BaseSERPService): diff --git a/src/brightdata/sync_client.py b/src/brightdata/sync_client.py index 4c57e67..c4c1116 100644 --- a/src/brightdata/sync_client.py +++ b/src/brightdata/sync_client.py @@ -5,10 +5,13 @@ """ import asyncio +import logging from typing import Optional, List, Dict, Any +logger = logging.getLogger(__name__) + from .client import BrightDataClient -from .api.browser_service import BrowserService +from .browser.service import BrowserService from .models import ScrapeResult, SearchResult from .types import AccountInfo @@ -48,7 +51,7 @@ def __init__( Initialize sync client. Args: - token: Bright Data API token (or set BRIGHT_DATA_API_TOKEN env var) + token: Bright Data API token (or set BRIGHTDATA_API_TOKEN env var) timeout: Default request timeout in seconds web_unlocker_zone: Zone name for Web Unlocker API serp_zone: Zone name for SERP API @@ -61,19 +64,18 @@ def __init__( rate_limit: Rate limit (requests per period) rate_period: Rate limit period in seconds """ - # Check if we're inside an async context - FIXED logic + # Check if we're inside an async context + loop_running = True try: asyncio.get_running_loop() - # If we get here, there IS a running loop - this is an error + except RuntimeError: + loop_running = False + + if loop_running: raise RuntimeError( - "SyncBrightDataClient cannot be used inside async context. " + "SyncBrightDataClient cannot be used inside an async context. 
" "Use BrightDataClient with async/await instead." ) - except RuntimeError as e: - # Only pass if it's the "no running event loop" error - if "no running event loop" not in str(e).lower(): - raise # Re-raise our custom error or other RuntimeErrors - # No running loop - correct for sync usage, continue self._async_client = BrightDataClient( token=token, @@ -138,14 +140,13 @@ def __exit__(self, exc_type, exc_val, exc_tb): if pending: self._loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True)) except Exception: - # Ignore errors during cleanup - pass + logger.debug("Error during SyncBrightDataClient cleanup", exc_info=True) finally: # Close the loop try: self._loop.close() except Exception: - pass + logger.debug("Error closing event loop", exc_info=True) self._loop = None def _run(self, coro): diff --git a/src/brightdata/types.py b/src/brightdata/types.py index bc08f0c..43a7807 100644 --- a/src/brightdata/types.py +++ b/src/brightdata/types.py @@ -1,231 +1,12 @@ """ Type definitions for Bright Data SDK. -This module provides type definitions for API responses and configuration. - -NOTE: Payload types have been migrated to dataclasses in payloads.py for: -- Runtime validation -- Default values -- Better IDE support -- Consistent developer experience with result models - -For backward compatibility, TypedDict versions are kept here but deprecated. -New code should use dataclasses from payloads.py instead. +This module provides type definitions for API responses used internally. 
""" -from typing import TypedDict, Optional, List, Literal, Union, Any, Dict +from typing import TypedDict, Optional, List, Dict, Any from typing_extensions import NotRequired -# Import dataclass payloads for backward compatibility - - -# DEPRECATED: TypedDict payloads kept for backward compatibility only -# Use dataclass versions from payloads.py for new code - - -class DatasetTriggerPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.DatasetTriggerPayload (dataclass) instead.""" - - url: str - keyword: str - location: str - country: str - max_results: int - - -class AmazonProductPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.AmazonProductPayload (dataclass) instead.""" - - url: str - reviews_count: NotRequired[int] - images_count: NotRequired[int] - - -class AmazonReviewPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.AmazonReviewPayload (dataclass) instead.""" - - url: str - pastDays: NotRequired[int] - keyWord: NotRequired[str] - numOfReviews: NotRequired[int] - - -class LinkedInProfilePayload(TypedDict, total=False): - """DEPRECATED: Use payloads.LinkedInProfilePayload (dataclass) instead.""" - - url: str - - -class LinkedInJobPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.LinkedInJobPayload (dataclass) instead.""" - - url: str - - -class LinkedInCompanyPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.LinkedInCompanyPayload (dataclass) instead.""" - - url: str - - -class LinkedInPostPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.LinkedInPostPayload (dataclass) instead.""" - - url: str - - -class LinkedInProfileSearchPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.LinkedInProfileSearchPayload (dataclass) instead.""" - - firstName: str - lastName: NotRequired[str] - title: NotRequired[str] - company: NotRequired[str] - location: NotRequired[str] - max_results: NotRequired[int] - - -class LinkedInJobSearchPayload(TypedDict, total=False): - """DEPRECATED: Use 
payloads.LinkedInJobSearchPayload (dataclass) instead.""" - - url: NotRequired[str] - keyword: NotRequired[str] - location: NotRequired[str] - country: NotRequired[str] - timeRange: NotRequired[str] - jobType: NotRequired[str] - experienceLevel: NotRequired[str] - remote: NotRequired[bool] - company: NotRequired[str] - locationRadius: NotRequired[str] - - -class LinkedInPostSearchPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.LinkedInPostSearchPayload (dataclass) instead.""" - - profile_url: str - start_date: NotRequired[str] - end_date: NotRequired[str] - - -class ChatGPTPromptPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.ChatGPTPromptPayload (dataclass) instead.""" - - prompt: str - country: NotRequired[str] - web_search: NotRequired[bool] - additional_prompt: NotRequired[str] - - -class FacebookPostsProfilePayload(TypedDict, total=False): - """DEPRECATED: Use payloads.FacebookPostsProfilePayload (dataclass) instead.""" - - url: str - num_of_posts: NotRequired[int] - posts_to_not_include: NotRequired[List[str]] - start_date: NotRequired[str] - end_date: NotRequired[str] - - -class FacebookPostsGroupPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.FacebookPostsGroupPayload (dataclass) instead.""" - - url: str - num_of_posts: NotRequired[int] - posts_to_not_include: NotRequired[List[str]] - start_date: NotRequired[str] - end_date: NotRequired[str] - - -class FacebookPostPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.FacebookPostPayload (dataclass) instead.""" - - url: str - - -class FacebookCommentsPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.FacebookCommentsPayload (dataclass) instead.""" - - url: str - num_of_comments: NotRequired[int] - comments_to_not_include: NotRequired[List[str]] - start_date: NotRequired[str] - end_date: NotRequired[str] - - -class FacebookReelsPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.FacebookReelsPayload (dataclass) instead.""" - - url: 
str - num_of_posts: NotRequired[int] - posts_to_not_include: NotRequired[List[str]] - start_date: NotRequired[str] - end_date: NotRequired[str] - - -class InstagramProfilePayload(TypedDict, total=False): - """DEPRECATED: Use payloads.InstagramProfilePayload (dataclass) instead.""" - - url: str - - -class InstagramPostPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.InstagramPostPayload (dataclass) instead.""" - - url: str - - -class InstagramCommentPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.InstagramCommentPayload (dataclass) instead.""" - - url: str - - -class InstagramReelPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.InstagramReelPayload (dataclass) instead.""" - - url: str - - -class InstagramPostsDiscoverPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.InstagramPostsDiscoverPayload (dataclass) instead.""" - - url: str - num_of_posts: NotRequired[int] - posts_to_not_include: NotRequired[List[str]] - start_date: NotRequired[str] - end_date: NotRequired[str] - post_type: NotRequired[str] - - -class InstagramReelsDiscoverPayload(TypedDict, total=False): - """DEPRECATED: Use payloads.InstagramReelsDiscoverPayload (dataclass) instead.""" - - url: str - num_of_posts: NotRequired[int] - posts_to_not_include: NotRequired[List[str]] - start_date: NotRequired[str] - end_date: NotRequired[str] - - -class TriggerResponse(TypedDict): - """Response from /datasets/v3/trigger.""" - - snapshot_id: str - - -class ProgressResponse(TypedDict): - """Response from /datasets/v3/progress/{snapshot_id}.""" - - status: Literal["ready", "in_progress", "error", "failed"] - progress: NotRequired[int] - - -class SnapshotResponse(TypedDict): - """Response from /datasets/v3/snapshot/{snapshot_id}.""" - - data: List[Dict[str, Any]] - class ZoneInfo(TypedDict, total=False): """Zone information from API.""" @@ -237,19 +18,6 @@ class ZoneInfo(TypedDict, total=False): created: NotRequired[str] -DeviceType = Literal["desktop", 
"mobile", "tablet"] -ResponseFormat = Literal["raw", "json"] -HTTPMethod = Literal["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"] -SearchEngine = Literal["google", "bing", "yandex"] -Platform = Literal["amazon", "linkedin", "chatgpt", "instagram", "reddit"] - - -URLParam = Union[str, List[str]] -OptionalURLParam = Optional[Union[str, List[str]]] -StringParam = Union[str, List[str]] -OptionalStringParam = Optional[Union[str, List[str]]] - - class AccountInfo(TypedDict): """Account information returned by get_account_info().""" @@ -301,50 +69,10 @@ class NormalizedSERPData(TypedDict, total=False): __all__ = [ - # Payloads - "DatasetTriggerPayload", - "AmazonProductPayload", - "AmazonReviewPayload", - "LinkedInProfilePayload", - "LinkedInJobPayload", - "LinkedInCompanyPayload", - "LinkedInPostPayload", - "LinkedInProfileSearchPayload", - "LinkedInJobSearchPayload", - "LinkedInPostSearchPayload", - "ChatGPTPromptPayload", - "FacebookPostsProfilePayload", - "FacebookPostsGroupPayload", - "FacebookPostPayload", - "FacebookCommentsPayload", - "FacebookReelsPayload", - # Instagram Payloads - "InstagramProfilePayload", - "InstagramPostPayload", - "InstagramCommentPayload", - "InstagramReelPayload", - "InstagramPostsDiscoverPayload", - "InstagramReelsDiscoverPayload", - # Responses - "TriggerResponse", - "ProgressResponse", - "SnapshotResponse", "ZoneInfo", "AccountInfo", - # SERP "SERPOrganicResult", "SERPFeaturedSnippet", "SERPKnowledgePanel", "NormalizedSERPData", - # Literals - "DeviceType", - "ResponseFormat", - "HTTPMethod", - "SearchEngine", - "Platform", - # Aliases - "URLParam", - "OptionalURLParam", - "StringParam", - "OptionalStringParam", ] diff --git a/src/brightdata/utils/parsing.py b/src/brightdata/utils/parsing.py deleted file mode 100644 index efec595..0000000 --- a/src/brightdata/utils/parsing.py +++ /dev/null @@ -1 +0,0 @@ -"""Content parsing.""" diff --git a/src/brightdata/utils/retry.py b/src/brightdata/utils/retry.py index 42b825f..6cb91d6 
100644 --- a/src/brightdata/utils/retry.py +++ b/src/brightdata/utils/retry.py @@ -2,7 +2,7 @@ import asyncio from typing import Callable, Awaitable, TypeVar, Optional, List, Type -from ..exceptions import APIError, NetworkError, TimeoutError +from ..exceptions import APIError, NetworkError T = TypeVar("T") diff --git a/src/brightdata/utils/timing.py b/src/brightdata/utils/timing.py deleted file mode 100644 index 68da927..0000000 --- a/src/brightdata/utils/timing.py +++ /dev/null @@ -1 +0,0 @@ -"""Performance measurement.""" diff --git a/src/brightdata/utils/url.py b/src/brightdata/utils/url.py index 7cde4a9..5127b32 100644 --- a/src/brightdata/utils/url.py +++ b/src/brightdata/utils/url.py @@ -27,20 +27,3 @@ def extract_root_domain(url: str) -> Optional[str]: return netloc if netloc else None except Exception: return None - - -def is_valid_url(url: str) -> bool: - """ - Check if URL is valid. - - Args: - url: URL string to check. - - Returns: - True if URL is valid, False otherwise. - """ - try: - result = urlparse(url) - return bool(result.scheme and result.netloc) - except Exception: - return False diff --git a/src/brightdata/utils/validation.py b/src/brightdata/utils/validation.py index 49d9494..c38a52c 100644 --- a/src/brightdata/utils/validation.py +++ b/src/brightdata/utils/validation.py @@ -107,23 +107,6 @@ def validate_timeout(timeout: int) -> None: raise ValidationError(f"Timeout must be positive, got {timeout}") -def validate_max_workers(max_workers: int) -> None: - """ - Validate max_workers value. - - Args: - max_workers: Maximum number of workers. - - Raises: - ValidationError: If max_workers is invalid. - """ - if not isinstance(max_workers, int): - raise ValidationError("max_workers must be an integer") - - if max_workers <= 0: - raise ValidationError(f"max_workers must be positive, got {max_workers}") - - def validate_response_format(response_format: str) -> None: """ Validate response format. 
diff --git a/src/brightdata/web_unlocker/__init__.py b/src/brightdata/web_unlocker/__init__.py new file mode 100644 index 0000000..32842dc --- /dev/null +++ b/src/brightdata/web_unlocker/__init__.py @@ -0,0 +1,5 @@ +"""Web Unlocker service.""" + +from .service import WebUnlockerService + +__all__ = ["WebUnlockerService"] diff --git a/src/brightdata/api/async_unblocker.py b/src/brightdata/web_unlocker/async_client.py similarity index 100% rename from src/brightdata/api/async_unblocker.py rename to src/brightdata/web_unlocker/async_client.py diff --git a/src/brightdata/api/base.py b/src/brightdata/web_unlocker/base.py similarity index 100% rename from src/brightdata/api/base.py rename to src/brightdata/web_unlocker/base.py diff --git a/src/brightdata/api/web_unlocker.py b/src/brightdata/web_unlocker/service.py similarity index 99% rename from src/brightdata/api/web_unlocker.py rename to src/brightdata/web_unlocker/service.py index c9c9b7a..db2feca 100644 --- a/src/brightdata/api/web_unlocker.py +++ b/src/brightdata/web_unlocker/service.py @@ -8,7 +8,7 @@ import asyncio from .base import BaseAPI -from .async_unblocker import AsyncUnblockerClient +from .async_client import AsyncUnblockerClient from ..models import ScrapeResult from ..utils.validation import ( validate_url, @@ -21,7 +21,7 @@ ) from ..utils.url import extract_root_domain from ..utils.function_detection import get_caller_function_name -from ..constants import HTTP_OK +from http import HTTPStatus from ..exceptions import ValidationError, APIError @@ -187,7 +187,7 @@ async def _scrape_single_async( ) as response: data_fetched_at = datetime.now(timezone.utc) - if response.status == HTTP_OK: + if response.status == HTTPStatus.OK: if response_format == "json": try: data = await response.json() diff --git a/tests/__init__.py b/tests/__init__.py index db49e82..e69de29 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +0,0 @@ -"""Test suite.""" diff --git a/tests/conftest.py b/tests/conftest.py index 
84d2142..edcce14 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,144 @@ -"""Pytest configuration.""" +"""Shared test fixtures for Bright Data SDK tests.""" import sys -from pathlib import Path +import os +from unittest.mock import AsyncMock, MagicMock +from typing import Any, Dict, Optional -# Add src directory to Python path -src_path = Path(__file__).parent.parent / "src" -sys.path.insert(0, str(src_path)) +import pytest + +# Add src to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + + +# --------------------------------------------------------------------------- +# Mock HTTP Response +# --------------------------------------------------------------------------- + + +class MockResponse: + """Mock aiohttp response with configurable status, JSON, and text.""" + + def __init__( + self, + status: int = 200, + json_data: Any = None, + text_data: str = "", + headers: Optional[Dict[str, str]] = None, + ): + self.status = status + self._json_data = json_data + self._text_data = text_data + self.headers = headers or {} + self.closed = False + + async def json(self): + return self._json_data + + async def text(self): + return self._text_data + + async def release(self): + pass + + def close(self): + self.closed = True + + +class MockContextManager: + """Mock async context manager wrapping a MockResponse.""" + + def __init__(self, response: MockResponse): + self._response = response + + async def __aenter__(self): + return self._response + + async def __aexit__(self, *args): + pass + + +# --------------------------------------------------------------------------- +# Engine fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def mock_engine(): + """ + Reusable mock AsyncEngine. + + The engine's HTTP methods (get, post, get_from_url, post_to_url, etc.) + return MockContextManager by default. Override per-test by setting + the return_value on the method. 
+ + Usage: + async with mock_engine.get("/endpoint") as response: + data = await response.json() + """ + engine = MagicMock() + engine.bearer_token = "test_token_123456789" + + default_response = MockResponse(200, json_data={}) + default_cm = MockContextManager(default_response) + + engine.get = MagicMock(return_value=default_cm) + engine.post = MagicMock(return_value=default_cm) + engine.delete = MagicMock(return_value=default_cm) + engine.get_from_url = MagicMock(return_value=default_cm) + engine.post_to_url = MagicMock(return_value=default_cm) + engine.request = MagicMock(return_value=default_cm) + + return engine + + +def make_engine_response(engine, method: str, response: MockResponse): + """ + Configure a mock engine method to return a specific response. + + Args: + engine: Mock engine from mock_engine fixture + method: Method name ("get", "post", "get_from_url", "post_to_url") + response: MockResponse to return + + Usage: + make_engine_response(mock_engine, "post_to_url", MockResponse(200, {"snapshot_id": "abc"})) + """ + cm = MockContextManager(response) + getattr(engine, method).return_value = cm + + +# --------------------------------------------------------------------------- +# API Client fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def mock_api_client(): + """Mock DatasetAPIClient with configurable trigger/status/fetch.""" + client = AsyncMock() + client.trigger = AsyncMock(return_value="snap_123") + client.get_status = AsyncMock(return_value="ready") + client.fetch_result = AsyncMock(return_value=[{"title": "Test"}]) + return client + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def make_status_sequence(statuses): + """ + Create an async side_effect that returns statuses in order. 
+ + Usage: + mock_api_client.get_status.side_effect = make_status_sequence( + ["in_progress", "in_progress", "ready"] + ) + """ + + async def side_effect(*args, **kwargs): + return statuses.pop(0) + + return side_effect diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py deleted file mode 100644 index 98b50e4..0000000 --- a/tests/e2e/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""End-to-end tests.""" diff --git a/tests/e2e/test_async_operations.py b/tests/e2e/test_async_operations.py deleted file mode 100644 index 618f036..0000000 --- a/tests/e2e/test_async_operations.py +++ /dev/null @@ -1 +0,0 @@ -"""E2E test for async operations.""" diff --git a/tests/e2e/test_batch_scrape.py b/tests/e2e/test_batch_scrape.py deleted file mode 100644 index b1fae12..0000000 --- a/tests/e2e/test_batch_scrape.py +++ /dev/null @@ -1 +0,0 @@ -"""E2E test for batch scraping.""" diff --git a/tests/e2e/test_client_e2e.py b/tests/e2e/test_client_e2e.py deleted file mode 100644 index 723b16e..0000000 --- a/tests/e2e/test_client_e2e.py +++ /dev/null @@ -1,315 +0,0 @@ -"""End-to-end tests for BrightDataClient hierarchical interface.""" - -import os -import pytest -from pathlib import Path - -# Load environment variables -try: - from dotenv import load_dotenv - - env_file = Path(__file__).parent.parent.parent.parent / ".env" - if env_file.exists(): - load_dotenv(env_file) -except ImportError: - pass - -from brightdata import BrightDataClient, SyncBrightDataClient - - -@pytest.fixture -def api_token(): - """Get API token from environment or skip tests.""" - token = os.getenv("BRIGHTDATA_API_TOKEN") - if not token: - pytest.skip("API token not found. 
Set BRIGHTDATA_API_TOKEN to run E2E tests.") - return token - - -@pytest.fixture -async def client(api_token): - """Create async client for testing.""" - async with BrightDataClient(token=api_token) as client: - yield client - - -class TestHierarchicalServiceAccess: - """Test the hierarchical service access pattern.""" - - def test_client_initialization_is_simple(self, api_token): - """Test client can be initialized with single line.""" - # Should work with environment variable - client = BrightDataClient() - assert client is not None - - # Should work with explicit token - client = BrightDataClient(token=api_token) - assert client is not None - - def test_service_properties_are_accessible(self, api_token): - """Test all service properties are accessible.""" - client = BrightDataClient(token=api_token) - - # All services should be accessible - assert client.scrape is not None - assert client.search is not None - assert client.crawler is not None - - def test_scrape_service_has_specialized_scrapers(self, api_token): - """Test scrape service provides access to specialized scrapers.""" - client = BrightDataClient(token=api_token) - - scrape = client.scrape - - # All scrapers should now be accessible - assert scrape.amazon is not None - assert scrape.linkedin is not None - assert scrape.chatgpt is not None - - # Verify they're the correct types - from brightdata.scrapers import AmazonScraper, LinkedInScraper, ChatGPTScraper - - assert isinstance(scrape.amazon, AmazonScraper) - assert isinstance(scrape.linkedin, LinkedInScraper) - assert isinstance(scrape.chatgpt, ChatGPTScraper) - - def test_search_service_has_search_engines(self, api_token): - """Test search service provides access to search engines.""" - client = BrightDataClient(token=api_token) - - search = client.search - - # All search engines should be callable - assert callable(search.google) - assert callable(search.google_async) - assert callable(search.bing) - assert callable(search.bing_async) - assert 
callable(search.yandex) - assert callable(search.yandex_async) - - def test_crawler_service_has_crawl_methods(self, api_token): - """Test crawler service provides crawling methods.""" - client = BrightDataClient(token=api_token) - - crawler = client.crawler - - # Should have crawler methods - assert hasattr(crawler, "discover") - assert hasattr(crawler, "sitemap") - assert callable(crawler.discover) - assert callable(crawler.sitemap) - - -class TestWebUnlocker: - """Test Web Unlocker scraping via scrape_url().""" - - @pytest.mark.asyncio - async def test_scrape_url_async(self, client): - """Test scrape_url() async.""" - result = await client.scrape_url(url="https://httpbin.org/html") - - assert result is not None - assert hasattr(result, "success") - assert hasattr(result, "data") - - def test_scrape_url_sync(self, api_token): - """Test scrape_url() synchronously using SyncBrightDataClient.""" - with SyncBrightDataClient(token=api_token) as client: - result = client.scrape_url(url="https://httpbin.org/html") - - assert result is not None - assert result.success or result.error is not None - - -class TestConnectionVerification: - """Test connection verification features.""" - - @pytest.mark.asyncio - async def test_connection_verification_workflow(self, client): - """Test complete connection verification workflow.""" - # Test connection - is_valid = await client.test_connection() - assert is_valid is True - - # Get account info - info = await client.get_account_info() - assert info is not None - assert isinstance(info, dict) - assert "zones" in info - - # Zones should be accessible - zones = info["zones"] - print(f"\n✅ Connected! 
Found {len(zones)} zones") - for zone in zones: - zone_name = zone.get("name", "unknown") - print(f" - {zone_name}") - - -class TestUserExperience: - """Test user experience matches requirements.""" - - def test_single_line_initialization(self): - """Test user can start with single line (environment variable).""" - # This should work if BRIGHTDATA_API_TOKEN is set - try: - client = BrightDataClient() - assert client is not None - print("\n✅ Single-line initialization works!") - except Exception as e: - pytest.skip(f"Environment variable not set: {e}") - - def test_clear_error_for_missing_credentials(self): - """Test error message is clear when credentials missing.""" - from unittest.mock import patch - - with pytest.raises(Exception) as exc_info: - with patch.dict(os.environ, {}, clear=True): - BrightDataClient() - - error_msg = str(exc_info.value) - assert "API token" in error_msg - assert "brightdata.com" in error_msg.lower() - - def test_hierarchical_access_is_intuitive(self, api_token): - """Test hierarchical access follows intuitive pattern.""" - client = BrightDataClient(token=api_token) - - # Pattern: client.{service}.{platform}.{action} - # Should be discoverable and intuitive - - # Scraping path - scrape_path = client.scrape - assert scrape_path is not None - - # Generic scraping (implemented) - generic_scraper = scrape_path.generic - assert generic_scraper is not None - assert hasattr(generic_scraper, "url") - - # Platform scrapers (all implemented now!) 
- amazon_scraper = scrape_path.amazon - assert amazon_scraper is not None - assert hasattr(amazon_scraper, "scrape") - assert hasattr(amazon_scraper, "products") - - linkedin_scraper = scrape_path.linkedin - assert linkedin_scraper is not None - assert hasattr(linkedin_scraper, "scrape") - assert hasattr(linkedin_scraper, "jobs") - - chatgpt_scraper = scrape_path.chatgpt - assert chatgpt_scraper is not None - assert hasattr(chatgpt_scraper, "prompt") - - print("\n✅ Hierarchical access pattern is intuitive!") - print(" - client.scrape_url() ✅ (working)") - print(" - client.scrape.amazon.products() ✅ (working)") - print(" - client.scrape.linkedin.jobs() ✅ (working)") - print(" - client.scrape.chatgpt.prompt() ✅ (working)") - print(" - client.search.google() 🚧 (planned)") - print(" - client.crawler.discover() 🚧 (planned)") - - -class TestPhilosophicalPrinciples: - """Test SDK follows stated philosophical principles.""" - - def test_client_is_single_source_of_truth(self, api_token): - """Test client is single source of truth for configuration.""" - client = BrightDataClient(token=api_token, timeout=60, web_unlocker_zone="custom_zone") - - # Configuration should be accessible from client - assert client.timeout == 60 - assert client.web_unlocker_zone == "custom_zone" - - # Services should reference client configuration - assert client.scrape._client is client - assert client.search._client is client - assert client.crawler._client is client - - def test_authentication_just_works(self): - """Test authentication 'just works' with minimal setup.""" - # With environment variable - should just work - try: - client = BrightDataClient() - assert client.token is not None - print("\n✅ Authentication works automatically from environment!") - except Exception: - pytest.skip("Environment variable not set") - - def test_fails_fast_on_missing_credentials(self): - """Test SDK fails fast when credentials missing.""" - from unittest.mock import patch - - # Should fail immediately on 
initialization - with patch.dict(os.environ, {}, clear=True): - try: - BrightDataClient() - pytest.fail("Should have raised error immediately") - except Exception as e: - # Should fail fast, not during first API call - assert "token" in str(e).lower() - print("\n✅ Fails fast on missing credentials!") - - def test_follows_principle_of_least_surprise(self, api_token): - """Test SDK follows principle of least surprise.""" - client = BrightDataClient(token=api_token) - - # Service properties should return same instance (cached) - scrape1 = client.scrape - scrape2 = client.scrape - assert scrape1 is scrape2 - - # Token should be accessible - assert client.token is not None - - # Repr should be informative - repr_str = repr(client) - assert "BrightDataClient" in repr_str - - print("\n✅ Follows principle of least surprise!") - print(f" Client repr: {repr_str}") - - -# Helper function for interactive testing -def demo_client_usage(): - """ - Demo function showing ideal client usage. - - This demonstrates the desired user experience. - """ - # Simple instantiation - auto-loads from env - client = BrightDataClient() - - # Or with explicit token - client = BrightDataClient(token="your_token") - - # Service access - hierarchical and intuitive - # client.scrape.amazon.products(...) - # client.search.linkedin.jobs(...) - # client.crawler.discover(...) 
- - # Connection verification - # is_valid = await client.test_connection() - # info = client.get_account_info() - - return client - - -if __name__ == "__main__": - """Run a quick demo of the client.""" - print("=" * 80) - print("BrightDataClient Demo") - print("=" * 80) - - try: - client = BrightDataClient() - print(f"✅ Client initialized: {client}") - print("✅ Token loaded from environment") - print("✅ Services available: scrape, search, crawler") - print() - print("Example usage:") - print(" result = client.scrape_url('https://example.com')") - print(" results = client.search.google('python scraping')") - print(" pages = client.crawler.discover('https://example.com')") - except Exception as e: - print(f"❌ Error: {e}") diff --git a/tests/e2e/test_simple_scrape.py b/tests/e2e/test_simple_scrape.py deleted file mode 100644 index 88210e1..0000000 --- a/tests/e2e/test_simple_scrape.py +++ /dev/null @@ -1 +0,0 @@ -"""E2E test for simple scraping.""" diff --git a/tests/enes/amazon.py b/tests/enes/amazon.py deleted file mode 100644 index e9fe520..0000000 --- a/tests/enes/amazon.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python3 -"""Test Amazon scraper to verify API fetches data correctly. 
- -How to run manually: - python tests/enes/amazon.py -""" - -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def test_amazon_products(): - """Test Amazon product scraping.""" - - print("=" * 60) - print("AMAZON SCRAPER TEST - Products") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.amazon - async with scraper.engine: - print("\n🛒 Testing Amazon product scraping...") - print("📍 Product URL: https://www.amazon.com/dp/B0CRMZHDG8") - - try: - result = await scraper.products( - url="https://www.amazon.com/dp/B0CRMZHDG8", timeout=240 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - print( - f" - result.status: {result.status if hasattr(result, 'status') else 'N/A'}" - ) - print(f" - result.error: {result.error if hasattr(result, 'error') else 'N/A'}") - - if result.data: - print("\n✅ Got product data:") - if isinstance(result.data, dict): - print(f" - Title: {result.data.get('title', 'N/A')}") - print(f" - Price: {result.data.get('price', 'N/A')}") - print(f" - ASIN: {result.data.get('asin', 'N/A')}") - print(f" - Rating: {result.data.get('rating', 'N/A')}") - print(f" - Review Count: {result.data.get('reviews_count', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No product data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_amazon_reviews(): - """Test Amazon reviews scraping.""" - - print("\n\n" + "=" * 60) - print("AMAZON SCRAPER TEST - Reviews") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.amazon - async 
with scraper.engine: - print("\n📝 Testing Amazon reviews scraping...") - print("📍 Product URL: https://www.amazon.com/dp/B0CRMZHDG8") - print("📋 Parameters: pastDays=30, numOfReviews=10") - - try: - result = await scraper.reviews( - url="https://www.amazon.com/dp/B0CRMZHDG8", - pastDays=30, - numOfReviews=10, - timeout=240, - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - print( - f" - result.status: {result.status if hasattr(result, 'status') else 'N/A'}" - ) - print(f" - result.error: {result.error if hasattr(result, 'error') else 'N/A'}") - - if result.data: - if isinstance(result.data, list): - print(f"\n✅ Got {len(result.data)} reviews:") - for i, review in enumerate(result.data[:3], 1): - print(f"\n Review {i}:") - print(f" - Rating: {review.get('rating', 'N/A')}") - print(f" - Title: {review.get('title', 'N/A')[:60]}...") - print(f" - Author: {review.get('author', 'N/A')}") - elif isinstance(result.data, dict): - reviews = result.data.get("reviews", []) - print(f"\n✅ Got {len(reviews)} reviews") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No reviews data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - print("\n🚀 Starting Amazon Scraper Tests\n") - asyncio.run(test_amazon_products()) - asyncio.run(test_amazon_reviews()) - print("\n" + "=" * 60) - print("✅ Amazon tests completed") - print("=" * 60) diff --git a/tests/enes/amazon_search.py b/tests/enes/amazon_search.py deleted file mode 100644 index e4a1831..0000000 --- a/tests/enes/amazon_search.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python3 -""" -Test NEW Amazon Search API Feature (client.search.amazon) - -This tests the NEW parameter-based Amazon search functionality: -- 
client.search.amazon.products(keyword="laptop", min_price=..., etc.) - -This is DIFFERENT from the old URL-based approach which gets blocked. - -python -m tests.enes.amazon_search -python tests/enes/amazon_search.py -""" - -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent / "src")) - -from brightdata import BrightDataClient - - -async def test_new_amazon_search_api(): - """Test the NEW Amazon Search API""" - print("\n" + "=" * 80) - print("TESTING: NEW client.search.amazon API") - print("=" * 80) - - client = BrightDataClient() - - # Check if search.amazon exists - if not hasattr(client.search, "amazon"): - print("\n❌ client.search.amazon NOT FOUND!") - print(" The new Amazon search feature is not available") - return False - - print("✅ client.search.amazon found!") - - test_results = [] - - # Test 1: Basic keyword search - print("\n" + "-" * 80) - print("1️⃣ TEST: Basic Keyword Search") - print("-" * 80) - print(" Method: client.search.amazon.products(keyword='laptop')") - - try: - async with client.engine: - result = await client.search.amazon.products(keyword="laptop") - - print(" ✅ API call succeeded") - print(f" Success: {result.success}") - print(f" Status: {result.status}") - - if result.success: - if isinstance(result.data, dict) and "error" in result.data: - print(f" ⚠️ Crawler blocked by Amazon: {result.data['error']}") - print(" (This is expected - Amazon blocks search pages)") - test_results.append(True) # API worked, Amazon blocked - elif isinstance(result.data, list): - print(f" ✅ SUCCESS! 
Got {len(result.data)} products") - test_results.append(True) - else: - print(f" ⚠️ Unexpected data type: {type(result.data)}") - test_results.append(False) - else: - print(f" ❌ Search failed: {result.error}") - test_results.append(False) - - except Exception as e: - print(f" ❌ Exception: {str(e)}") - test_results.append(False) - - # Test 2: Search with price filters - print("\n" + "-" * 80) - print("2️⃣ TEST: Keyword + Price Filters") - print("-" * 80) - print(" Method: client.search.amazon.products(") - print(" keyword='headphones',") - print(" min_price=5000, # $50") - print(" max_price=20000 # $200") - print(" )") - - try: - async with client.engine: - result = await client.search.amazon.products( - keyword="headphones", min_price=5000, max_price=20000 - ) - - print(" ✅ API call succeeded") - print(f" Success: {result.success}") - - if result.success: - if isinstance(result.data, dict) and "error" in result.data: - print(" ⚠️ Crawler blocked by Amazon") - test_results.append(True) - elif isinstance(result.data, list): - print(f" ✅ SUCCESS! 
Got {len(result.data)} products") - test_results.append(True) - else: - test_results.append(False) - else: - print(f" ❌ Search failed: {result.error}") - test_results.append(False) - - except Exception as e: - print(f" ❌ Exception: {str(e)}") - test_results.append(False) - - # Test 3: Prime eligible filter - print("\n" + "-" * 80) - print("3️⃣ TEST: Prime Eligible Filter") - print("-" * 80) - print(" Method: client.search.amazon.products(") - print(" keyword='phone charger',") - print(" prime_eligible=True") - print(" )") - - try: - async with client.engine: - result = await client.search.amazon.products( - keyword="phone charger", prime_eligible=True - ) - - print(" ✅ API call succeeded") - print(f" Success: {result.success}") - - if result.success: - if isinstance(result.data, dict) and "error" in result.data: - print(" ⚠️ Crawler blocked by Amazon") - test_results.append(True) - elif isinstance(result.data, list): - print(f" ✅ SUCCESS! Got {len(result.data)} products") - test_results.append(True) - else: - test_results.append(False) - else: - print(f" ❌ Search failed: {result.error}") - test_results.append(False) - - except Exception as e: - print(f" ❌ Exception: {str(e)}") - test_results.append(False) - - # Final summary - print("\n" + "=" * 80) - print("TEST RESULTS SUMMARY") - print("=" * 80) - - passed = sum(test_results) - total = len(test_results) - - print(f" Passed: {passed}/{total}") - - if passed == total: - print("\n✅ ALL TESTS PASSED!") - print("\n📊 Analysis:") - print(" ✅ NEW client.search.amazon API is working") - print(" ✅ SDK correctly builds search URLs from keywords") - print(" ✅ SDK correctly triggers/polls/fetches results") - print(" ⚠️ Amazon may still block searches (anti-bot protection)") - print("\n💡 Key Difference:") - print(" OLD: client.scrape.amazon.products('https://amazon.com/s?k=laptop')") - print(" NEW: client.search.amazon.products(keyword='laptop')") - return True - else: - print(f"\n❌ {total - passed} test(s) failed") - return 
False - - -if __name__ == "__main__": - asyncio.run(test_new_amazon_search_api()) diff --git a/tests/enes/chatgpt.py b/tests/enes/chatgpt.py deleted file mode 100644 index dc66153..0000000 --- a/tests/enes/chatgpt.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python3 -"""Test ChatGPT scraper to verify API fetches data correctly. - -How to run manually: - python tests/enes/chatgpt.py -""" - -import asyncio -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def test_chatgpt_single_prompt(): - """Test ChatGPT single prompt.""" - - print("=" * 60) - print("CHATGPT SCRAPER TEST - Single Prompt") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.chatgpt - print("\n🤖 Testing ChatGPT single prompt...") - print("📋 Prompt: 'Explain async programming in Python in 2 sentences'") - - try: - result = await scraper.prompt( - prompt="Explain async programming in Python in 2 sentences", - web_search=False, - poll_timeout=180, - ) - - print("\n✅ API call succeeded") - if result.elapsed_ms(): - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - print("\n✅ Got ChatGPT response:") - if isinstance(result.data, list) and len(result.data) > 0: - response = result.data[0] - print(f" - Answer: {response.get('answer_text', 'N/A')[:200]}...") - print(f" - Model: {response.get('model', 'N/A')}") - print(f" - Country: {response.get('country', 'N/A')}") - elif isinstance(result.data, dict): - print(f" - Answer: {result.data.get('answer_text', 'N/A')[:200]}...") - print(f" - Model: {result.data.get('model', 'N/A')}") - elif isinstance(result.data, str): - print(f" - Response: {result.data[:200]}...") - else: - print(f" Unexpected data type: {type(result.data)}") - else: - 
print("\n❌ No response data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_chatgpt_web_search(): - """Test ChatGPT prompt with web search enabled.""" - - print("\n\n" + "=" * 60) - print("CHATGPT SCRAPER TEST - Web Search") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.chatgpt - print("\n🔍 Testing ChatGPT with web search...") - print("📋 Prompt: 'What are the latest developments in AI in 2025?'") - print("🌐 Web search: Enabled") - - try: - result = await scraper.prompt( - prompt="What are the latest developments in AI in 2025?", - web_search=True, - poll_timeout=180, - ) - - print("\n✅ API call succeeded") - if result.elapsed_ms(): - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - print("\n✅ Got ChatGPT response with web search:") - if isinstance(result.data, list) and len(result.data) > 0: - response = result.data[0] - print(f" - Answer: {response.get('answer_text', 'N/A')[:200]}...") - print(f" - Model: {response.get('model', 'N/A')}") - print( - f" - Web search triggered: {response.get('web_search_triggered', False)}" - ) - elif isinstance(result.data, dict): - print(f" - Answer: {result.data.get('answer_text', 'N/A')[:200]}...") - print( - f" - Web search triggered: {result.data.get('web_search_triggered', False)}" - ) - elif isinstance(result.data, str): - print(f" - Response: {result.data[:200]}...") - else: - print(f" Unexpected data type: {type(result.data)}") - else: - print("\n❌ No response data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_chatgpt_multiple_prompts(): - """Test ChatGPT batch prompts.""" - - print("\n\n" + "=" * 60) - print("CHATGPT SCRAPER TEST - Multiple 
Prompts") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.chatgpt - print("\n📝 Testing ChatGPT batch prompts...") - print("📋 Prompts: ['What is Python?', 'What is JavaScript?']") - - try: - result = await scraper.prompts( - prompts=[ - "What is Python in one sentence?", - "What is JavaScript in one sentence?", - ], - web_searches=[False, False], - poll_timeout=180, - ) - - print("\n✅ API call succeeded") - if result.elapsed_ms(): - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - if isinstance(result.data, list): - print(f"\n✅ Got {len(result.data)} responses:") - for i, response in enumerate(result.data, 1): - print(f"\n Response {i}:") - if isinstance(response, dict): - print(f" - Prompt: {response.get('input', {}).get('prompt', 'N/A')}") - print(f" - Answer: {response.get('answer_text', 'N/A')[:150]}...") - print(f" - Model: {response.get('model', 'N/A')}") - else: - print(f" - Response: {str(response)[:100]}...") - else: - print(f" Unexpected data type: {type(result.data)}") - else: - print("\n❌ No responses returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - print("\n🚀 Starting ChatGPT Scraper Tests\n") - asyncio.run(test_chatgpt_single_prompt()) - asyncio.run(test_chatgpt_web_search()) - asyncio.run(test_chatgpt_multiple_prompts()) - print("\n" + "=" * 60) - print("✅ ChatGPT tests completed") - print("=" * 60) diff --git a/tests/enes/chatgpt_02.py b/tests/enes/chatgpt_02.py deleted file mode 100644 index 476b8c3..0000000 --- a/tests/enes/chatgpt_02.py +++ /dev/null @@ -1,245 +0,0 @@ -#!/usr/bin/env python3 -"""Test ChatGPT scraper functionality. - -Tests the ChatGPT prompt-based interface and verifies it works correctly. 
- -How to run manually: - python probe_tests/test_07_chatgpt.py -""" - -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def test_chatgpt(): - """Test ChatGPT functionality.""" - - print("Testing ChatGPT Scraper") - print("=" * 60) - - # Initialize client - client = BrightDataClient() - - print(f"\n📍 Using bearer token: {client.token[:20]}...") - - # Initialize engine context - ALL operations must be within this context - async with client.engine: - - # Test 1: Basic single prompt - print("\n1. Testing basic single prompt...") - try: - prompt = "What is 2+2?" - print(f" Prompt: '{prompt}'") - print(" Web search: False") - print(" Country: US (default)") - - scraper = client.scrape.chatgpt - result = await scraper.prompt(prompt=prompt, web_search=False, poll_timeout=60) - - if result.success: - print(" ✅ Prompt successful!") - print(f" Data type: {type(result.data)}") - if result.elapsed_ms(): - print(f" Elapsed: {result.elapsed_ms():.2f}ms") - if result.cost: - print(f" Cost: ${result.cost:.6f}") - - # Show response - if result.data and len(result.data) > 0: - response = result.data[0] - print("\n Response:") - print(f" - Answer: {response.get('answer_text', 'N/A')[:100]}...") - print(f" - Model: {response.get('model', 'N/A')}") - print(f" - Country: {response.get('country', 'N/A')}") - else: - print(" ⚠️ No response data") - else: - print(f" ❌ Prompt failed: {result.error}") - - except Exception as e: - print(f" ❌ Error: {e}") - - # Test 2: Prompt with web search - print("\n2. Testing prompt with web search...") - try: - prompt = "What are the latest AI developments in 2025?" 
- print(f" Prompt: '{prompt}'") - print(" Web search: True") - print(" Country: US") - - result = await scraper.prompt( - prompt=prompt, country="us", web_search=True, poll_timeout=90 - ) - - if result.success: - print(" ✅ Web search prompt successful!") - print(f" Results count: {len(result.data) if result.data else 0}") - - if result.data and len(result.data) > 0: - response = result.data[0] - print(f" - Answer preview: {response.get('answer_text', 'N/A')[:150]}...") - print(f" - Web search used: {response.get('web_search_triggered', False)}") - else: - print(f" ❌ Failed: {result.error}") - - except Exception as e: - print(f" ❌ Error: {e}") - - # Test 3: Batch prompts - print("\n3. Testing batch prompts...") - try: - prompts = ["What is Python in one sentence?", "What is JavaScript in one sentence?"] - print(f" Prompts: {prompts}") - print(" Countries: ['us', 'us']") - - result = await scraper.prompts( - prompts=prompts, - countries=["us", "us"], - web_searches=[False, False], - poll_timeout=120, - ) - - if result.success: - print(" ✅ Batch prompts successful!") - print(f" Responses: {len(result.data) if result.data else 0}") - - if result.data: - for i, response in enumerate(result.data[:2], 1): - print(f"\n Response {i}:") - print(f" - Prompt: {response.get('input', {}).get('prompt', 'N/A')}") - print(f" - Answer: {response.get('answer_text', 'N/A')[:100]}...") - print(f" - Country: {response.get('country', 'N/A')}") - else: - print(f" ❌ Failed: {result.error}") - - except Exception as e: - print(f" ❌ Error: {e}") - - # Test 4: Follow-up prompt (additional_prompt) - print("\n4. Testing follow-up prompt...") - try: - prompt = "What is machine learning?" - follow_up = "Can you give a simple example?" 
- print(f" Initial prompt: '{prompt}'") - print(f" Follow-up: '{follow_up}'") - - result = await scraper.prompt( - prompt=prompt, additional_prompt=follow_up, web_search=False, poll_timeout=90 - ) - - if result.success: - print(" ✅ Follow-up prompt successful!") - - if result.data and len(result.data) > 0: - response = result.data[0] - print(f" - Combined answer: {response.get('answer_text', 'N/A')[:200]}...") - else: - print(f" ❌ Failed: {result.error}") - - except Exception as e: - print(f" ❌ Error: {e}") - - # Test 5: Verify ChatGPT doesn't support URL scraping - print("\n5. Verifying URL scraping is disabled...") - try: - # This should raise NotImplementedError - await scraper.scrape_async("https://example.com") - print(" ❌ scrape_async() should have raised NotImplementedError") - except NotImplementedError as e: - print(" ✅ Correctly raises NotImplementedError") - print(f" - Message: {str(e)[:60]}...") - except Exception as e: - print(f" ❌ Unexpected error: {e}") - - # Test 6: Check ChatGPT-specific attributes - print("\n6. Checking ChatGPT-specific configuration...") - try: - print(f" Dataset ID: {scraper.DATASET_ID}") - print(f" Platform name: {scraper.PLATFORM_NAME}") - print(f" Min poll timeout: {scraper.MIN_POLL_TIMEOUT}s") - print(f" Cost per record: ${scraper.COST_PER_RECORD}") - - # Verify these are ChatGPT-specific values - checks = [ - scraper.DATASET_ID == "gd_m7aof0k82r803d5bjm", - scraper.PLATFORM_NAME == "chatgpt", - scraper.COST_PER_RECORD == 0.005, # ChatGPT is more expensive - ] - - if all(checks): - print(" ✅ All ChatGPT-specific attributes correct") - else: - print(" ⚠️ Some attributes don't match expected values") - - except Exception as e: - print(f" ❌ Error: {e}") - - # Test 7: Manual trigger/status/fetch workflow - print("\n7. Testing manual trigger/status/fetch...") - try: - prompt = "What is 1+1?" 
- print(f" Prompt: '{prompt}'") - - # Trigger only - job = await scraper.prompt_trigger(prompt=prompt) - print(f" ✅ Triggered job: {job.snapshot_id}") - - # Check status - status = await scraper.prompt_status(job.snapshot_id) - print(f" Initial status: {status}") - - # Poll until ready - max_attempts = 30 - for attempt in range(max_attempts): - status = await scraper.prompt_status(job.snapshot_id) - if status == "ready": - print(f" Status ready after {attempt + 1} checks") - break - elif status == "error": - print(" ❌ Job failed with error status") - break - await asyncio.sleep(2) - - # Fetch results - if status == "ready": - data = await scraper.prompt_fetch(job.snapshot_id) - print(" ✅ Fetched data successfully") - if data and len(data) > 0: - print(f" - Answer: {data[0].get('answer_text', 'N/A')[:100]}...") - - except Exception as e: - print(f" ❌ Error: {e}") - - print("\n" + "=" * 60) - print("SUMMARY:") - print("-" * 40) - print( - """ -ChatGPT Scraper Configuration: -- Dataset ID: gd_m7aof0k82r803d5bjm -- Platform: chatgpt -- Cost per prompt: $0.005 -- Default timeout: 120s (longer for AI responses) - -Key differences from regular scrapers: -1. Uses prompt/prompts methods instead of scrape -2. Requires prompt parameter, not URLs -3. Supports web_search and additional_prompt options -4. Higher cost per operation -5. Longer response times - -If getting errors: -1. Check API token is valid -2. Verify account has ChatGPT access enabled -3. Check account balance for ChatGPT operations -""" - ) - - -if __name__ == "__main__": - asyncio.run(test_chatgpt()) diff --git a/tests/enes/facebook.py b/tests/enes/facebook.py deleted file mode 100644 index 21a2578..0000000 --- a/tests/enes/facebook.py +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/bin/env python3 -"""Test Facebook scraper to verify API fetches data correctly. 
- -How to run manually: - python tests/enes/facebook.py -""" - -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def test_facebook_posts_by_profile(): - """Test Facebook posts by profile scraping.""" - - print("=" * 60) - print("FACEBOOK SCRAPER TEST - Posts by Profile") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.facebook - async with scraper.engine: - print("\n👤 Testing Facebook posts by profile...") - print("📍 Profile URL: https://www.facebook.com/facebook") - print("📋 Parameters: num_of_posts=5") - - try: - result = await scraper.posts_by_profile( - url="https://www.facebook.com/facebook", num_of_posts=5, timeout=240 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - if isinstance(result.data, list): - print(f"\n✅ Got {len(result.data)} posts:") - for i, post in enumerate(result.data[:3], 1): - print(f"\n Post {i}:") - print( - f" - Text: {post.get('text', 'N/A')[:60]}..." 
- if post.get("text") - else " - Text: N/A" - ) - print(f" - Likes: {post.get('likes', 'N/A')}") - print(f" - Comments: {post.get('comments', 'N/A')}") - print(f" - Shares: {post.get('shares', 'N/A')}") - elif isinstance(result.data, dict): - print("\n✅ Got post data:") - print(f" - Text: {result.data.get('text', 'N/A')[:60]}...") - print(f" - Likes: {result.data.get('likes', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No post data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_facebook_posts_by_group(): - """Test Facebook posts by group scraping.""" - - print("\n\n" + "=" * 60) - print("FACEBOOK SCRAPER TEST - Posts by Group") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.facebook - async with scraper.engine: - print("\n🏢 Testing Facebook posts by group...") - print("📍 Group URL: https://www.facebook.com/groups/example") - print("📋 Parameters: num_of_posts=5") - - try: - result = await scraper.posts_by_group( - url="https://www.facebook.com/groups/example", num_of_posts=5, timeout=240 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - if isinstance(result.data, list): - print(f"\n✅ Got {len(result.data)} posts:") - for i, post in enumerate(result.data[:3], 1): - print(f"\n Post {i}:") - print( - f" - Text: {post.get('text', 'N/A')[:60]}..." 
- if post.get("text") - else " - Text: N/A" - ) - print(f" - Author: {post.get('author', 'N/A')}") - print(f" - Likes: {post.get('likes', 'N/A')}") - elif isinstance(result.data, dict): - print("\n✅ Got post data") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No post data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_facebook_posts_by_url(): - """Test Facebook specific post scraping.""" - - print("\n\n" + "=" * 60) - print("FACEBOOK SCRAPER TEST - Post by URL") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.facebook - async with scraper.engine: - print("\n📄 Testing Facebook specific post...") - print("📍 Post URL: https://www.facebook.com/facebook/posts/123456789") - - try: - result = await scraper.posts_by_url( - url="https://www.facebook.com/facebook/posts/123456789", timeout=240 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - print("\n✅ Got post data:") - if isinstance(result.data, dict): - print( - f" - Text: {result.data.get('text', 'N/A')[:60]}..." 
- if result.data.get("text") - else " - Text: N/A" - ) - print(f" - Likes: {result.data.get('likes', 'N/A')}") - print(f" - Comments: {result.data.get('comments', 'N/A')}") - print(f" - Shares: {result.data.get('shares', 'N/A')}") - print(f" - Posted: {result.data.get('posted_date', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No post data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_facebook_comments(): - """Test Facebook comments scraping.""" - - print("\n\n" + "=" * 60) - print("FACEBOOK SCRAPER TEST - Comments") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.facebook - async with scraper.engine: - print("\n💬 Testing Facebook comments...") - print("📍 Post URL: https://www.facebook.com/facebook/posts/123456789") - print("📋 Parameters: num_of_comments=10") - - try: - result = await scraper.comments( - url="https://www.facebook.com/facebook/posts/123456789", - num_of_comments=10, - timeout=240, - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - if isinstance(result.data, list): - print(f"\n✅ Got {len(result.data)} comments:") - for i, comment in enumerate(result.data[:3], 1): - print(f"\n Comment {i}:") - print( - f" - Text: {comment.get('text', 'N/A')[:60]}..." 
- if comment.get("text") - else " - Text: N/A" - ) - print(f" - Author: {comment.get('author', 'N/A')}") - print(f" - Likes: {comment.get('likes', 'N/A')}") - elif isinstance(result.data, dict): - comments = result.data.get("comments", []) - print(f"\n✅ Got {len(comments)} comments") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No comments data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_facebook_reels(): - """Test Facebook reels scraping.""" - - print("\n\n" + "=" * 60) - print("FACEBOOK SCRAPER TEST - Reels") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.facebook - async with scraper.engine: - print("\n🎥 Testing Facebook reels...") - print("📍 Profile URL: https://www.facebook.com/facebook") - print("📋 Parameters: num_of_posts=5") - - try: - result = await scraper.reels( - url="https://www.facebook.com/facebook", num_of_posts=5, timeout=240 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - if isinstance(result.data, list): - print(f"\n✅ Got {len(result.data)} reels:") - for i, reel in enumerate(result.data[:3], 1): - print(f"\n Reel {i}:") - print( - f" - Text: {reel.get('text', 'N/A')[:60]}..." 
- if reel.get("text") - else " - Text: N/A" - ) - print(f" - Views: {reel.get('views', 'N/A')}") - print(f" - Likes: {reel.get('likes', 'N/A')}") - elif isinstance(result.data, dict): - print("\n✅ Got reel data") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No reels data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - print("\n🚀 Starting Facebook Scraper Tests\n") - asyncio.run(test_facebook_posts_by_profile()) - asyncio.run(test_facebook_posts_by_group()) - asyncio.run(test_facebook_posts_by_url()) - asyncio.run(test_facebook_comments()) - asyncio.run(test_facebook_reels()) - print("\n" + "=" * 60) - print("✅ Facebook tests completed") - print("=" * 60) diff --git a/tests/enes/get_dataset_metadata.py b/tests/enes/get_dataset_metadata.py deleted file mode 100644 index 8ffe811..0000000 --- a/tests/enes/get_dataset_metadata.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -"""Get dataset metadata to understand correct input parameters.""" - -import sys -import asyncio -import json -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def get_metadata(dataset_id: str, name: str): - """Fetch and display dataset metadata.""" - - print(f"\n{'=' * 60}") - print(f"{name} - Dataset Metadata") - print(f"Dataset ID: {dataset_id}") - print(f"{'=' * 60}") - - client = BrightDataClient() - - async with client.engine: - try: - url = f"{client.engine.BASE_URL}/datasets/{dataset_id}/metadata" - - async with client.engine.get_from_url(url) as response: - if response.status == 200: - data = await response.json() - - print("\n✅ Got metadata!") - - # Display input schema - if "input_schema" in data: - print("\n📋 INPUT SCHEMA:") - print(json.dumps(data["input_schema"], indent=2)) - - # Display other useful info - if "name" in data: - print(f"\nName: {data['name']}") - if 
"description" in data: - print(f"Description: {data['description'][:200]}...") - - else: - error_text = await response.text() - print(f"\n❌ API call failed (HTTP {response.status})") - print(f"Error: {error_text}") - - except Exception as e: - print(f"\n❌ Error: {e}") - - -async def main(): - """Get metadata for key datasets.""" - - datasets = [ - ("gd_l7q7dkf244hwjntr0", "Amazon Products"), - ("gd_le8e811kzy4ggddlq", "Amazon Reviews"), - ("gd_l1viktl72bvl7bjuj0", "LinkedIn Profiles"), - ("gd_l1vikfnt1wgvvqz95w", "LinkedIn Companies"), - ("gd_lpfll7v5hcqtkxl6l", "LinkedIn Jobs"), - ("gd_l1vikfch901nx3by4", "Instagram Profiles"), - ("gd_lk5ns7kz21pck8jpis", "Instagram Posts"), - ("gd_lkaxegm826bjpoo9m5", "Facebook Posts by Profile"), - ] - - for dataset_id, name in datasets: - await get_metadata(dataset_id, name) - await asyncio.sleep(0.5) # Rate limiting - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/enes/get_datasets.py b/tests/enes/get_datasets.py deleted file mode 100644 index 688910c..0000000 --- a/tests/enes/get_datasets.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python3 -"""Get list of available datasets from Bright Data API.""" - -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def get_datasets(): - """Fetch and display available datasets.""" - - print("=" * 60) - print("BRIGHT DATA - Available Datasets") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - print("\n🔍 Fetching dataset list from API...") - - try: - # Make API call to get dataset list - url = f"{client.engine.BASE_URL}/datasets/list" - print(f"📡 URL: {url}") - - async with client.engine.get_from_url(url) as response: - if response.status == 200: - data = await response.json() - - print("\n✅ Got response!") - print(f"📊 Response type: {type(data)}") - - if isinstance(data, list): - print(f"📋 Found 
{len(data)} datasets\n") - - # Group by platform - platforms = {} - for dataset in data: - name = dataset.get("name", "unknown") - dataset_id = dataset.get("id", "unknown") - - # Extract platform from name - platform = name.split("_")[0] if "_" in name else name - - if platform not in platforms: - platforms[platform] = [] - platforms[platform].append({"name": name, "id": dataset_id}) - - # Display grouped results - for platform, datasets in sorted(platforms.items()): - print(f"\n🔹 {platform.upper()}") - for ds in datasets: - print(f" {ds['name']}: {ds['id']}") - - elif isinstance(data, dict): - print("\n📦 Response data:") - import json - - print(json.dumps(data, indent=2)) - - else: - print("\n⚠️ Unexpected response format") - print(f"Data: {data}") - - else: - error_text = await response.text() - print(f"\n❌ API call failed (HTTP {response.status})") - print(f"Error: {error_text}") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - asyncio.run(get_datasets()) diff --git a/tests/enes/instagram.py b/tests/enes/instagram.py deleted file mode 100644 index 91bb749..0000000 --- a/tests/enes/instagram.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python3 -"""Test Instagram scraper and search to verify API fetches data correctly. 
- -How to run manually: - python tests/enes/instagram.py -""" - -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def test_instagram_profiles(): - """Test Instagram profile scraping.""" - - print("=" * 60) - print("INSTAGRAM SCRAPER TEST - Profiles") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.instagram - async with scraper.engine: - print("\n👤 Testing Instagram profile scraping...") - print("📍 Profile URL: https://www.instagram.com/instagram") - - try: - result = await scraper.profiles( - url="https://www.instagram.com/instagram", timeout=180 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - print("\n✅ Got profile data:") - if isinstance(result.data, dict): - print(f" - Username: {result.data.get('username', 'N/A')}") - print(f" - Full Name: {result.data.get('full_name', 'N/A')}") - print(f" - Followers: {result.data.get('followers', 'N/A')}") - print(f" - Following: {result.data.get('following', 'N/A')}") - print(f" - Posts: {result.data.get('posts_count', 'N/A')}") - print(f" - Bio: {result.data.get('bio', 'N/A')[:60]}...") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No profile data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_instagram_posts(): - """Test Instagram post scraping.""" - - print("\n\n" + "=" * 60) - print("INSTAGRAM SCRAPER TEST - Posts") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.instagram - async with scraper.engine: - print("\n📸 Testing Instagram post scraping...") - 
print("📍 Post URL: https://www.instagram.com/p/C9z9z9z9z9z") - - try: - result = await scraper.posts( - url="https://www.instagram.com/p/C9z9z9z9z9z", timeout=180 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - print("\n✅ Got post data:") - if isinstance(result.data, dict): - print(f" - Caption: {result.data.get('caption', 'N/A')[:60]}...") - print(f" - Likes: {result.data.get('likes', 'N/A')}") - print(f" - Comments: {result.data.get('comments_count', 'N/A')}") - print(f" - Posted: {result.data.get('timestamp', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No post data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_instagram_reels(): - """Test Instagram reel scraping.""" - - print("\n\n" + "=" * 60) - print("INSTAGRAM SCRAPER TEST - Reels") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.instagram - async with scraper.engine: - print("\n🎥 Testing Instagram reel scraping...") - print("📍 Reel URL: https://www.instagram.com/reel/ABC123") - - try: - result = await scraper.reels( - url="https://www.instagram.com/reel/ABC123", timeout=180 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - print("\n✅ Got reel data:") - if isinstance(result.data, dict): - print(f" - Caption: {result.data.get('caption', 'N/A')[:60]}...") - print(f" - Likes: {result.data.get('likes', 'N/A')}") - print(f" - Views: {result.data.get('views', 'N/A')}") - print(f" - Comments: 
{result.data.get('comments_count', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No reel data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_instagram_search_posts(): - """Test Instagram post search/discovery.""" - - print("\n\n" + "=" * 60) - print("INSTAGRAM SEARCH TEST - Posts") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.search.instagram - async with scraper.engine: - print("\n🔍 Testing Instagram post search...") - print("📋 Search: profile url, num_of_posts=10") - - try: - result = await scraper.posts( - url="https://www.instagram.com/instagram", num_of_posts=10, timeout=180 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - if isinstance(result.data, list): - print(f"\n✅ Got {len(result.data)} post results:") - for i, post in enumerate(result.data[:3], 1): - print(f"\n Post {i}:") - print(f" - Caption: {post.get('caption', 'N/A')[:50]}...") - print(f" - Likes: {post.get('likes', 'N/A')}") - print(f" - Comments: {post.get('comments_count', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No search results returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - print("\n🚀 Starting Instagram Scraper & Search Tests\n") - asyncio.run(test_instagram_profiles()) - asyncio.run(test_instagram_posts()) - asyncio.run(test_instagram_reels()) - asyncio.run(test_instagram_search_posts()) - print("\n" + "=" * 60) - print("✅ Instagram tests completed") - print("=" * 60) diff --git a/tests/enes/linkedin.py b/tests/enes/linkedin.py deleted file mode 100644 index 908e601..0000000 --- 
a/tests/enes/linkedin.py +++ /dev/null @@ -1,214 +0,0 @@ -#!/usr/bin/env python3 -"""Test LinkedIn scraper and search to verify API fetches data correctly. - -How to run manually: - python tests/enes/linkedin.py -""" - -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def test_linkedin_profiles(): - """Test LinkedIn profile scraping.""" - - print("=" * 60) - print("LINKEDIN SCRAPER TEST - Profiles") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.linkedin - async with scraper.engine: - print("\n👤 Testing LinkedIn profile scraping...") - print("📍 Profile URL: https://www.linkedin.com/in/williamhgates") - - try: - result = await scraper.profiles( - url="https://www.linkedin.com/in/williamhgates", timeout=180 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - print("\n✅ Got profile data:") - if isinstance(result.data, dict): - print(f" - Name: {result.data.get('name', 'N/A')}") - print(f" - Headline: {result.data.get('headline', 'N/A')}") - print(f" - Location: {result.data.get('location', 'N/A')}") - print(f" - Connections: {result.data.get('connections', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No profile data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_linkedin_companies(): - """Test LinkedIn company scraping.""" - - print("\n\n" + "=" * 60) - print("LINKEDIN SCRAPER TEST - Companies") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.linkedin - async with scraper.engine: - print("\n🏢 
Testing LinkedIn company scraping...") - print("📍 Company URL: https://www.linkedin.com/company/microsoft") - - try: - result = await scraper.companies( - url="https://www.linkedin.com/company/microsoft", timeout=180 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - print("\n✅ Got company data:") - if isinstance(result.data, dict): - print(f" - Name: {result.data.get('name', 'N/A')}") - print(f" - Industry: {result.data.get('industry', 'N/A')}") - print(f" - Size: {result.data.get('company_size', 'N/A')}") - print(f" - Website: {result.data.get('website', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No company data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_linkedin_jobs(): - """Test LinkedIn job scraping.""" - - print("\n\n" + "=" * 60) - print("LINKEDIN SCRAPER TEST - Jobs") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.scrape.linkedin - async with scraper.engine: - print("\n💼 Testing LinkedIn job scraping...") - print("📍 Job URL: https://www.linkedin.com/jobs/view/3787241244") - - try: - result = await scraper.jobs( - url="https://www.linkedin.com/jobs/view/3787241244", timeout=180 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - - if result.data: - print("\n✅ Got job data:") - if isinstance(result.data, dict): - print(f" - Title: {result.data.get('title', 'N/A')}") - print(f" - Company: {result.data.get('company', 'N/A')}") - print(f" - Location: 
{result.data.get('location', 'N/A')}") - print(f" - Posted: {result.data.get('posted_date', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No job data returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -async def test_linkedin_search_jobs(): - """Test LinkedIn job search.""" - - print("\n\n" + "=" * 60) - print("LINKEDIN SEARCH TEST - Jobs") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - scraper = client.search.linkedin - async with scraper.engine: - print("\n🔍 Testing LinkedIn job search...") - print("📋 Search: keyword='python developer', location='New York'") - - try: - result = await scraper.jobs( - keyword="python developer", location="New York", timeout=180 - ) - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - print( - f" - result.status: {result.status if hasattr(result, 'status') else 'N/A'}" - ) - print(f" - result.error: {result.error if hasattr(result, 'error') else 'N/A'}") - - if result.data: - if isinstance(result.data, list): - print(f"\n✅ Got {len(result.data)} job results:") - for i, job in enumerate(result.data[:3], 1): - print(f"\n Job {i}:") - print(f" - Title: {job.get('title', 'N/A')}") - print(f" - Company: {job.get('company', 'N/A')}") - print(f" - Location: {job.get('location', 'N/A')}") - else: - print(f" Data: {result.data}") - else: - print("\n❌ No search results returned") - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - print("\n🚀 Starting LinkedIn Scraper & Search Tests\n") - asyncio.run(test_linkedin_profiles()) - asyncio.run(test_linkedin_companies()) - asyncio.run(test_linkedin_jobs()) - asyncio.run(test_linkedin_search_jobs()) 
- print("\n" + "=" * 60) - print("✅ LinkedIn tests completed") - print("=" * 60) diff --git a/tests/enes/serp.py b/tests/enes/serp.py deleted file mode 100644 index 4226e05..0000000 --- a/tests/enes/serp.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python3 -"""Simple test to demonstrate SERP API raw HTML issue. - -How to run manually: - python probe_tests/test_04_serp_google_simple.py -""" - -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def test_serp_raw_html_issue(): - """Test showing SERP returns raw HTML that SDK can't parse.""" - - print("SERP API Raw HTML Issue Demonstration") - print("=" * 60) - - # Initialize client with serp_api1 zone - client = BrightDataClient(serp_zone="sdk_serp") - - # Initialize engine context - async with client.engine: - print("\n🔍 Searching for 'pizza' using Google SERP API...") - print(f"📍 Zone: {client.serp_zone}") - print("📋 Payload sent to API: format='json' (hardcoded in SDK)") - - try: - # Make the search request - result = await client.search.google(query="pizza") - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - # Show what we got back - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - print(f" - result.data length: {len(result.data) if result.data else 0}") - - if result.data and len(result.data) > 0: - print(f"\n✅ Got {len(result.data)} parsed results") - first = result.data[0] - print(f" First result: {first}") - else: - print("\n❌ Got 0 results (empty list)") - print("\n🔍 Why this happens:") - print(" 1. SDK sends: format='json' (expecting parsed data)") - print( - " 2. API returns: {'status_code': 200, 'headers': {...}, 'body': '...'}" - ) - print(" 3. SDK's normalizer looks for 'organic' field but finds 'body' with HTML") - print(" 4. 
Normalizer returns empty list since it can't parse HTML") - - # Make a direct API call to show what's really returned - print("\n📡 Making direct API call to show actual response...") - from brightdata.api.serp import GoogleSERPService - - service = GoogleSERPService( - engine=client.engine, - timeout=client.timeout, - ) - - # Temporarily modify the normalizer to show raw data - original_normalize = service.data_normalizer.normalize - raw_response = None - - def capture_raw(data): - nonlocal raw_response - raw_response = data - return original_normalize(data) - - service.data_normalizer.normalize = capture_raw - - # Make the request - await service.search(query="pizza", zone=client.serp_zone) - - if raw_response: - print("\n📦 Raw API response structure:") - if isinstance(raw_response, dict): - for key in raw_response.keys(): - value = raw_response[key] - if key == "body" and isinstance(value, str): - print(f" - {key}: HTML string ({len(value)} chars)") - print(f" First 200 chars: {value[:200]}...") - elif key == "headers": - print(f" - {key}: {{...}} (response headers)") - else: - print(f" - {key}: {value}") - - print("\n⚠️ The problem:") - print( - " - SDK expects: {'organic': [...], 'ads': [...], 'featured_snippet': {...}}" - ) - print( - " - API returns: {'status_code': 200, 'headers': {...}, 'body': ''}" - ) - print(" - Result: SDK can't extract search results from raw HTML") - - except Exception as e: - print(f"\n❌ Error: {e}") - - print("\n" + "=" * 60) - print("SUMMARY:") - print("-" * 40) - print( - """ -The SERP API returns raw HTML but the SDK expects parsed JSON. -This is why all SERP searches return 0 results. - -To fix this, either: -1. The SERP zone needs to return parsed data (not raw HTML) -2. The SDK needs an HTML parser (BeautifulSoup, etc.) -3. 
A different Bright Data service/endpoint should be used -""" - ) - - -if __name__ == "__main__": - asyncio.run(test_serp_raw_html_issue()) diff --git a/tests/enes/web_unlocker.py b/tests/enes/web_unlocker.py deleted file mode 100644 index df34ae0..0000000 --- a/tests/enes/web_unlocker.py +++ /dev/null @@ -1,233 +0,0 @@ -#!/usr/bin/env python3 -"""Test Web Unlocker (Generic Scraper) to verify API fetches data correctly. - -How to run manually: - python tests/enes/web_unlocker.py -""" - -import sys -import asyncio -import json -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - -# Create samples directory -SAMPLES_DIR = Path(__file__).parent.parent / "samples" / "web_unlocker" -SAMPLES_DIR.mkdir(parents=True, exist_ok=True) - - -async def test_web_unlocker_single_url(): - """Test Web Unlocker with a single URL.""" - - print("=" * 60) - print("WEB UNLOCKER TEST - Single URL") - print("=" * 60) - - client = BrightDataClient() - - async with client.engine: - print("\n🌐 Testing Web Unlocker with single URL...") - print("📍 URL: https://httpbin.org/html") - - try: - result = await client.scrape_url(url="https://httpbin.org/html", response_format="raw") - - print("\n✅ API call succeeded") - print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") - - print("\n📊 Result analysis:") - print(f" - result.success: {result.success}") - print(f" - result.data type: {type(result.data)}") - print(f" - result.status: {result.status if hasattr(result, 'status') else 'N/A'}") - print(f" - result.error: {result.error if hasattr(result, 'error') else 'N/A'}") - print(f" - result.method: {result.method if hasattr(result, 'method') else 'N/A'}") - - if result.data: - print("\n✅ Got data:") - if isinstance(result.data, str): - print(f" - Data length: {len(result.data)} characters") - print(f" - First 200 chars: {result.data[:200]}...") - print(f" - Contains HTML: {' 10: - 
print(f" ... and {len(zones4) - 10} more") - - return True - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - return False - - -if __name__ == "__main__": - asyncio.run(demo_caching()) diff --git a/tests/enes/zones/clean_zones.py b/tests/enes/zones/clean_zones.py deleted file mode 100644 index 7ccbb11..0000000 --- a/tests/enes/zones/clean_zones.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python3 -""" -Cleanup script to delete test zones created during SDK testing. - -This script will: -1. List all zones -2. Identify test zones (matching patterns) -3. Ask for confirmation -4. Delete the selected zones -""" - -import os -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient -from brightdata.exceptions import ZoneError - - -async def cleanup_test_zones(): - """Clean up test zones.""" - - print("\n" + "=" * 70) - print("CLEANUP TEST ZONES") - print("=" * 70) - - if not os.environ.get("BRIGHTDATA_API_TOKEN"): - print("\n❌ ERROR: No API token found") - return False - - client = BrightDataClient(validate_token=False) - - # Patterns to identify test zones - test_patterns = [ - "sdk_unlocker_", - "sdk_serp_", - "test_", - ] - - # Zones to KEEP (don't delete these) - keep_zones = [ - "residential", - "mobile", - "sdk_unlocker", # Original zones without timestamps - "sdk_serp", - ] - - try: - async with client: - print("\n📊 Fetching all zones...") - all_zones = await client.list_zones() - print(f"✅ Found {len(all_zones)} total zones") - - # Identify test zones - test_zones = [] - for zone in all_zones: - zone_name = zone.get("name", "") - - # Skip zones we want to keep - if zone_name in keep_zones: - continue - - # Check if it matches test patterns - if any(pattern in zone_name for pattern in test_patterns): - test_zones.append(zone) - - if not test_zones: - print("\n✅ No test zones found to clean up!") - 
return True - - print(f"\n🔍 Found {len(test_zones)} test zones to clean up:") - print("-" * 70) - for i, zone in enumerate(test_zones, 1): - zone_name = zone.get("name") - zone_type = zone.get("type", "unknown") - print(f" {i:2d}. {zone_name} ({zone_type})") - - print("-" * 70) - print(f"\n⚠️ This will delete {len(test_zones)} zones!") - print(" Zones to KEEP: " + ", ".join(keep_zones)) - - # Ask for confirmation - response = input("\n❓ Delete these zones? (yes/no): ").strip().lower() - - if response not in ["yes", "y"]: - print("\n❌ Cleanup cancelled by user") - return False - - # Delete zones - print(f"\n🗑️ Deleting {len(test_zones)} zones...") - deleted_count = 0 - failed_count = 0 - - for i, zone in enumerate(test_zones, 1): - zone_name = zone.get("name") - try: - print(f" [{i}/{len(test_zones)}] Deleting '{zone_name}'...", end=" ") - await client.delete_zone(zone_name) - print("✅") - deleted_count += 1 - - # Small delay to avoid rate limiting - if i % 5 == 0: - await asyncio.sleep(0.5) - - except ZoneError as e: - print(f"❌ ({e})") - failed_count += 1 - except Exception as e: - print(f"❌ ({e})") - failed_count += 1 - - # Wait a bit for changes to propagate - await asyncio.sleep(2) - - # Verify - print("\n🔍 Verifying cleanup...") - final_zones = await client.list_zones() - print(f"✅ Current zone count: {len(final_zones)}") - - # Summary - print("\n" + "=" * 70) - print("📊 CLEANUP SUMMARY:") - print("=" * 70) - print(f" Initial zones: {len(all_zones)}") - print(f" Test zones found: {len(test_zones)}") - print(f" Successfully deleted: {deleted_count}") - print(f" Failed to delete: {failed_count}") - print(f" Final zone count: {len(final_zones)}") - print(f" Zones freed: {len(all_zones) - len(final_zones)}") - - print("\n✅ CLEANUP COMPLETED!") - print("=" * 70) - - return True - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - return False - - -if __name__ == "__main__": - try: - success = 
asyncio.run(cleanup_test_zones()) - sys.exit(0 if success else 1) - except KeyboardInterrupt: - print("\n\n⚠️ Cleanup interrupted by user") - sys.exit(2) diff --git a/tests/enes/zones/crud_zones.py b/tests/enes/zones/crud_zones.py deleted file mode 100644 index b8bdd24..0000000 --- a/tests/enes/zones/crud_zones.py +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env python3 -""" -Comprehensive CRUD test for Zone Management. - -This test performs a complete cycle: -1. CREATE - Create new test zones -2. READ - List zones and verify they exist -3. UPDATE - (Not supported by API, zones are immutable) -4. DELETE - Delete test zones -5. VERIFY - Confirm zones appear/disappear in dashboard - -Tests that zones appear in the Bright Data dashboard. -""" - -import os -import sys -import asyncio -import time -from pathlib import Path -from typing import List - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient -from brightdata.exceptions import ZoneError, AuthenticationError - - -class ZoneCRUDTester: - """Test CRUD operations for zones.""" - - def __init__(self): - self.client = BrightDataClient(validate_token=False) - self.test_zones: List[str] = [] - self.timestamp = str(int(time.time()))[-6:] - - async def test_create_zones(self) -> bool: - """Test zone creation.""" - print("\n" + "=" * 70) - print("1️⃣ CREATE - Testing Zone Creation") - print("=" * 70) - - # Define test zones to create - zones_to_create = [ - (f"crud_test_unlocker_{self.timestamp}", "unblocker"), - (f"crud_test_serp_{self.timestamp}", "serp"), - ] - - self.test_zones = [name for name, _ in zones_to_create] - - print(f"\n📋 Will create {len(zones_to_create)} test zones:") - for name, ztype in zones_to_create: - print(f" - {name} ({ztype})") - - created_count = 0 - - for zone_name, zone_type in zones_to_create: - print(f"\n Creating '{zone_name}'...", end=" ") - try: - # Create zone using auto_create_zones - temp_client = BrightDataClient( - 
auto_create_zones=True, - web_unlocker_zone=zone_name if zone_type == "unblocker" else "sdk_unlocker", - serp_zone=zone_name if zone_type == "serp" else None, - validate_token=False, - ) - - async with temp_client: - # Trigger zone creation - try: - if zone_type == "unblocker": - await temp_client.scrape_url(url="https://example.com", zone=zone_name) - else: # serp - await temp_client.search.google(query="test", zone=zone_name) - except Exception: - # Zone might be created even if operation fails - pass - - print("✅") - created_count += 1 - await asyncio.sleep(0.5) # Small delay between creations - - except AuthenticationError as e: - print(f"❌ Auth error: {e}") - if "zone limit" in str(e).lower(): - print(" ⚠️ Zone limit reached!") - return False - except Exception as e: - print(f"❌ Error: {e}") - - print(f"\n✅ Created {created_count}/{len(zones_to_create)} zones") - return created_count > 0 - - async def test_read_zones(self) -> bool: - """Test zone listing and reading.""" - print("\n" + "=" * 70) - print("2️⃣ READ - Testing Zone Listing") - print("=" * 70) - - # Wait for zones to be fully registered - print("\n⏳ Waiting 2 seconds for zones to register...") - await asyncio.sleep(2) - - # Test list_zones() - always fresh - print("\n📋 Method 1: Using list_zones() [FRESH DATA]") - zones = await self.client.list_zones() - zone_names = {z.get("name") for z in zones} - print(f" Total zones: {len(zones)}") - - # Check if our test zones are present - found_zones = [] - missing_zones = [] - - for test_zone in self.test_zones: - if test_zone in zone_names: - found_zones.append(test_zone) - else: - missing_zones.append(test_zone) - - print("\n Our test zones:") - for zone in found_zones: - print(f" ✅ {zone}") - for zone in missing_zones: - print(f" ❌ {zone} (NOT FOUND)") - - # Test get_account_info() - with refresh - print("\n📊 Method 2: Using get_account_info(refresh=True) [FRESH DATA]") - info = await self.client.get_account_info(refresh=True) - info_zones = 
info.get("zones", []) - info_zone_names = {z.get("name") for z in info_zones} - print(f" Total zones: {len(info_zones)}") - print(f" Our zones present: {all(z in info_zone_names for z in self.test_zones)}") - - # Display zone details - print("\n📂 Test Zone Details:") - for zone in zones: - if zone.get("name") in self.test_zones: - print(f" 🔹 {zone.get('name')}") - print(f" Type: {zone.get('type')}") - print(f" Status: {zone.get('status', 'active')}") - - success = len(found_zones) == len(self.test_zones) - if success: - print(f"\n✅ All {len(self.test_zones)} test zones found in dashboard!") - else: - print(f"\n⚠️ Only {len(found_zones)}/{len(self.test_zones)} zones found") - - return success - - async def test_delete_zones(self) -> bool: - """Test zone deletion.""" - print("\n" + "=" * 70) - print("3️⃣ DELETE - Testing Zone Deletion") - print("=" * 70) - - print(f"\n🗑️ Deleting {len(self.test_zones)} test zones...") - - deleted_count = 0 - failed_count = 0 - - for zone_name in self.test_zones: - print(f" Deleting '{zone_name}'...", end=" ") - try: - await self.client.delete_zone(zone_name) - print("✅") - deleted_count += 1 - await asyncio.sleep(0.3) # Small delay - except ZoneError as e: - print(f"❌ {e}") - failed_count += 1 - except Exception as e: - print(f"❌ {e}") - failed_count += 1 - - print("\n📊 Deletion Summary:") - print(f" Successfully deleted: {deleted_count}") - print(f" Failed to delete: {failed_count}") - - return deleted_count > 0 - - async def verify_deletion(self) -> bool: - """Verify zones were deleted.""" - print("\n" + "=" * 70) - print("4️⃣ VERIFY - Confirming Deletion") - print("=" * 70) - - print("\n⏳ Waiting 2 seconds for deletion to propagate...") - await asyncio.sleep(2) - - print("\n🔍 Checking if zones are gone...") - zones = await self.client.list_zones() - zone_names = {z.get("name") for z in zones} - - still_present = [] - successfully_deleted = [] - - for test_zone in self.test_zones: - if test_zone in zone_names: - 
still_present.append(test_zone) - else: - successfully_deleted.append(test_zone) - - print("\n Zones successfully deleted:") - for zone in successfully_deleted: - print(f" ✅ {zone}") - - if still_present: - print("\n Zones still present (deletion might be delayed):") - for zone in still_present: - print(f" ⚠️ {zone}") - - print(f"\n📊 Final zone count: {len(zones)}") - - success = len(successfully_deleted) == len(self.test_zones) - if success: - print(f"✅ All {len(self.test_zones)} zones successfully deleted from dashboard!") - else: - print(f"⚠️ {len(still_present)} zone(s) still visible") - - return success - - async def run_full_test(self) -> bool: - """Run the complete CRUD test cycle.""" - print("\n" + "=" * 70) - print("🧪 ZONE CRUD TEST - Full Cycle") - print("=" * 70) - print("\nThis test will:") - print(" 1. CREATE new test zones") - print(" 2. READ/LIST zones (verify they appear in dashboard)") - print(" 3. DELETE test zones") - print(" 4. VERIFY deletion") - - try: - async with self.client: - # Get initial state - initial_zones = await self.client.list_zones() - print(f"\n📊 Initial state: {len(initial_zones)} zones in account") - - # CREATE - if not await self.test_create_zones(): - print("\n❌ Zone creation failed!") - return False - - # READ - if not await self.test_read_zones(): - print("\n⚠️ Some zones not found in dashboard") - # Continue anyway to cleanup - - # DELETE - if not await self.test_delete_zones(): - print("\n❌ Zone deletion failed!") - return False - - # VERIFY - if not await self.verify_deletion(): - print("\n⚠️ Some zones still visible after deletion") - - # Final state - final_zones = await self.client.list_zones() - print(f"\n📊 Final state: {len(final_zones)} zones in account") - print(f" Net change: {len(final_zones) - len(initial_zones)} zones") - - # Overall result - print("\n" + "=" * 70) - print("✅ CRUD TEST COMPLETED SUCCESSFULLY!") - print("=" * 70) - print("\n🎉 Summary:") - print(" ✓ Zones can be created via SDK") - print(" ✓ 
Zones appear in Bright Data dashboard") - print(" ✓ Zones can be listed via API") - print(" ✓ Zones can be deleted via SDK") - print(" ✓ Deletions are reflected in dashboard") - - return True - - except Exception as e: - print(f"\n❌ Test failed with error: {e}") - import traceback - - traceback.print_exc() - return False - - -async def main(): - """Main test runner.""" - if not os.environ.get("BRIGHTDATA_API_TOKEN"): - print("\n❌ ERROR: No API token found") - print("Please set BRIGHTDATA_API_TOKEN environment variable") - return False - - tester = ZoneCRUDTester() - return await tester.run_full_test() - - -if __name__ == "__main__": - try: - success = asyncio.run(main()) - sys.exit(0 if success else 1) - except KeyboardInterrupt: - print("\n\n⚠️ Test interrupted by user") - sys.exit(2) diff --git a/tests/enes/zones/dash_sync.py b/tests/enes/zones/dash_sync.py deleted file mode 100644 index 9fbaa2b..0000000 --- a/tests/enes/zones/dash_sync.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python3 -""" -Verify that zones in SDK match what's shown in the Bright Data dashboard. - -This script shows that: -1. The SDK accurately reads zone data -2. Changes made via SDK are reflected in the dashboard -3. 
The dashboard and API are synchronized -""" - -import os -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def verify_dashboard_sync(): - """Verify SDK zones match dashboard.""" - - print("\n" + "=" * 70) - print("🔍 DASHBOARD SYNC VERIFICATION") - print("=" * 70) - - if not os.environ.get("BRIGHTDATA_API_TOKEN"): - print("\n❌ ERROR: No API token found") - return False - - client = BrightDataClient(validate_token=False) - - try: - async with client: - print("\n📊 Fetching zones from Bright Data API...") - zones = await client.list_zones() - - print(f"✅ Found {len(zones)} zones total\n") - - # Group zones by type - zones_by_type = {} - for zone in zones: - ztype = zone.get("type", "unknown") - if ztype not in zones_by_type: - zones_by_type[ztype] = [] - zones_by_type[ztype].append(zone) - - # Display zones grouped by type - print("📂 ZONES BY TYPE:") - print("=" * 70) - - for ztype, zlist in sorted(zones_by_type.items()): - print(f"\n🔹 {ztype.upper()} ({len(zlist)} zones)") - print("-" * 70) - for zone in sorted(zlist, key=lambda z: z.get("name", "")): - name = zone.get("name") - status = zone.get("status", "active") - print(f" • {name:40s} [{status}]") - - print("\n" + "=" * 70) - print("✅ VERIFICATION COMPLETE") - print("=" * 70) - print( - """ -These zones should match exactly what you see in your dashboard at: -https://brightdata.com/cp/zones - -📋 How to verify: - 1. Go to: https://brightdata.com/cp/zones - 2. Count the total zones shown - 3. Compare with the count above - 4. Check that zone names and types match - -✅ If they match: SDK and dashboard are in sync! 
-❌ If they don't: There may be a caching or API delay issue - """ - ) - - return True - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - return False - - -if __name__ == "__main__": - try: - success = asyncio.run(verify_dashboard_sync()) - sys.exit(0 if success else 1) - except KeyboardInterrupt: - print("\n⚠️ Verification interrupted") - sys.exit(2) diff --git a/tests/enes/zones/delete_zone.py b/tests/enes/zones/delete_zone.py deleted file mode 100644 index 67985aa..0000000 --- a/tests/enes/zones/delete_zone.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python3 -""" -Demo script for zone deletion functionality. - -This script demonstrates: -1. Listing all zones -2. Creating a test zone -3. Verifying it exists -4. Deleting the test zone -5. Verifying it's gone -""" - -import os -import sys -import asyncio -import time -from pathlib import Path - -# Add parent directory to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient -from brightdata.exceptions import ZoneError, AuthenticationError - - -async def demo_delete_zone(): - """Demonstrate zone deletion functionality.""" - - print("\n" + "=" * 60) - print("ZONE DELETION DEMO") - print("=" * 60) - - # Check for API token - if not os.environ.get("BRIGHTDATA_API_TOKEN"): - print("\n❌ ERROR: No API token found") - print("Please set BRIGHTDATA_API_TOKEN environment variable") - return False - - # Create client - client = BrightDataClient(validate_token=False) - - # Create a unique test zone name - timestamp = str(int(time.time()))[-6:] - test_zone_name = f"test_delete_zone_{timestamp}" - - try: - async with client: - # Step 1: List initial zones - print("\n📊 Step 1: Listing current zones...") - initial_zones = await client.list_zones() - {z.get("name") for z in initial_zones} - print(f"✅ Found {len(initial_zones)} zones") - - # Step 2: Create a test zone - print(f"\n🔧 Step 2: Creating test zone 
'{test_zone_name}'...") - test_client = BrightDataClient( - auto_create_zones=True, web_unlocker_zone=test_zone_name, validate_token=False - ) - - try: - async with test_client: - # Trigger zone creation - try: - await test_client.scrape_url(url="https://example.com", zone=test_zone_name) - except Exception as e: - # Zone might be created even if scrape fails - print(f" ℹ️ Scrape error (expected): {e}") - - print(f"✅ Test zone '{test_zone_name}' created") - except Exception as e: - print(f"❌ Failed to create test zone: {e}") - return False - - # Wait a bit for zone to be fully registered - await asyncio.sleep(2) - - # Step 3: Verify zone exists - print(f"\n🔍 Step 3: Verifying zone '{test_zone_name}' exists...") - zones_after_create = await client.list_zones() - zone_names_after_create = {z.get("name") for z in zones_after_create} - - if test_zone_name in zone_names_after_create: - print(f"✅ Zone '{test_zone_name}' found in zone list") - # Print zone details - test_zone = next(z for z in zones_after_create if z.get("name") == test_zone_name) - print(f" Type: {test_zone.get('type', 'unknown')}") - print(f" Status: {test_zone.get('status', 'unknown')}") - else: - print(f"⚠️ Zone '{test_zone_name}' not found (might still be creating)") - - # Step 4: Delete the test zone - print(f"\n🗑️ Step 4: Deleting zone '{test_zone_name}'...") - try: - await client.delete_zone(test_zone_name) - print(f"✅ Zone '{test_zone_name}' deleted successfully") - except ZoneError as e: - print(f"❌ Failed to delete zone: {e}") - return False - except AuthenticationError as e: - print(f"❌ Authentication error: {e}") - return False - - # Wait a bit for deletion to propagate - await asyncio.sleep(2) - - # Step 5: Verify zone is gone - print(f"\n🔍 Step 5: Verifying zone '{test_zone_name}' is deleted...") - final_zones = await client.list_zones() - final_zone_names = {z.get("name") for z in final_zones} - - if test_zone_name not in final_zone_names: - print(f"✅ Confirmed: Zone '{test_zone_name}' no 
longer exists") - else: - print( - f"⚠️ Zone '{test_zone_name}' still appears in list (deletion might be delayed)" - ) - - # Summary - print("\n" + "=" * 60) - print("📈 SUMMARY:") - print(f" Initial zones: {len(initial_zones)}") - print(f" After creation: {len(zones_after_create)}") - print(f" After deletion: {len(final_zones)}") - print(f" Net change: {len(final_zones) - len(initial_zones)}") - - print("\n" + "=" * 60) - print("✅ DEMO COMPLETED SUCCESSFULLY") - print("=" * 60) - - return True - - except Exception as e: - print(f"\n❌ Unexpected error: {e}") - import traceback - - traceback.print_exc() - return False - - -def main(): - """Main entry point.""" - try: - success = asyncio.run(demo_delete_zone()) - sys.exit(0 if success else 1) - except KeyboardInterrupt: - print("\n\n⚠️ Demo interrupted by user") - sys.exit(2) - except Exception as e: - print(f"\n❌ Fatal error: {e}") - sys.exit(3) - - -if __name__ == "__main__": - main() diff --git a/tests/enes/zones/list_zones.py b/tests/enes/zones/list_zones.py deleted file mode 100644 index 878e815..0000000 --- a/tests/enes/zones/list_zones.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python3 -""" -Test 02: List and Analyze Available Zones - -This file lists all available zones in your Bright Data account and analyzes -their capabilities for different services (Web Unlocker, SERP, Browser API). 
- -How to run manually: - python probe_tests/test_02_list_zones.py - -Requirements: - - Valid BRIGHTDATA_API_TOKEN -""" - -import os -import sys -import json -import traceback -from pathlib import Path -from datetime import datetime - -# Add parent directory to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from brightdata import BrightDataClient -from brightdata.exceptions import AuthenticationError, APIError - - -def print_header(title): - """Print formatted header.""" - print(f"\n{'='*60}") - print(f"{title:^60}") - print(f"{'='*60}") - - -def print_section(title): - """Print section header.""" - print(f"\n{'-'*40}") - print(f"{title}") - print(f"{'-'*40}") - - -def test_list_zones(): - """List all available zones and their configurations.""" - print_header("BRIGHT DATA ZONES ANALYZER") - - try: - # Check for API token - if not os.environ.get("BRIGHTDATA_API_TOKEN"): - print("\n❌ ERROR: No API token found") - print("Please set BRIGHTDATA_API_TOKEN environment variable") - return False - - # Create client - client = BrightDataClient() - print("\n✅ Client initialized successfully") - - # Get account info - print("\nFetching account information...") - info = client.get_account_info_sync() - - # Display customer info - print_section("ACCOUNT INFORMATION") - print(f"Customer ID: {info.get('customer_id', 'Not available')}") - print(f"Token Valid: {info.get('token_valid', False)}") - print(f"Retrieved At: {info.get('retrieved_at', 'Unknown')}") - - # Analyze zones - zones = info.get("zones", []) - print(f"\nTotal Zones: {len(zones)}") - - if not zones: - print("\n⚠️ No zones found in your account") - print("\nTo create zones:") - print("1. Log in to https://brightdata.com") - print("2. Navigate to Zones section") - print("3. 
Create zones for Web Unlocker, SERP, or Browser API") - return False - - # List all zones with details - print_section("AVAILABLE ZONES") - - for i, zone in enumerate(zones, 1): - print(f"\nZone {i}:") - print(f" Name: {zone.get('name', 'Unknown')}") - print(f" Status: {zone.get('status', 'Unknown')}") - - # Check plan details if available - plan = zone.get("plan", {}) - if plan: - print(f" Plan Type: {plan.get('type', 'Unknown')}") - print(f" Plan Description: {plan.get('description', 'N/A')}") - - # Creation date if available - created = zone.get("created") - if created: - print(f" Created: {created}") - - # Try to determine zone capabilities based on name/plan - zone_name = zone.get("name", "").lower() - capabilities = [] - - if "unlocker" in zone_name or "unblocker" in zone_name: - capabilities.append("Web Unlocker") - if "serp" in zone_name or "search" in zone_name: - capabilities.append("SERP/Search") - if "browser" in zone_name or "scraper" in zone_name: - capabilities.append("Browser/Scraper") - if "residential" in zone_name: - capabilities.append("Residential Proxy") - if "datacenter" in zone_name: - capabilities.append("Datacenter Proxy") - - if capabilities: - print(f" Likely Capabilities: {', '.join(capabilities)}") - - # Suggest zone configuration - print_section("ZONE CONFIGURATION SUGGESTIONS") - - # Check for Web Unlocker zone - unlocker_zones = [z for z in zones if "unlocker" in z.get("name", "").lower()] - if unlocker_zones: - print(f"✅ Web Unlocker zone found: {unlocker_zones[0].get('name')}") - print(f" Use: BrightDataClient(web_unlocker_zone='{unlocker_zones[0].get('name')}')") - else: - print("❌ No Web Unlocker zone found") - print(" Suggestion: Create a zone with Web Unlocker service enabled") - - # Check for SERP zone - serp_zones = [z for z in zones if "serp" in z.get("name", "").lower()] - if serp_zones: - print(f"\n✅ SERP zone found: {serp_zones[0].get('name')}") - print(f" Use: BrightDataClient(serp_zone='{serp_zones[0].get('name')}')") 
- else: - print("\n❌ No SERP zone found") - print(" Suggestion: Create a zone with SERP API service enabled") - - # Check for Browser zone - browser_zones = [ - z - for z in zones - if "browser" in z.get("name", "").lower() or "scraper" in z.get("name", "").lower() - ] - if browser_zones: - print(f"\n✅ Browser/Scraper zone found: {browser_zones[0].get('name')}") - print(f" Use: BrightDataClient(browser_zone='{browser_zones[0].get('name')}')") - else: - print("\n❌ No Browser/Scraper zone found") - print(" Suggestion: Create a zone with Browser API or Web Scraper service") - - # Test zone connectivity - print_section("ZONE CONNECTIVITY TEST") - - if zones: - # Try to use the first zone for a test - first_zone = zones[0].get("name") - print(f"\nTesting with zone: {first_zone}") - - try: - # Create client with specific zone - test_client = BrightDataClient(web_unlocker_zone=first_zone) - - # Try a simple scrape - print(f"Attempting to scrape with zone '{first_zone}'...") - result = test_client.scrape_url("https://httpbin.org/html", zone=first_zone) - - if result.success: - print(f"✅ Zone '{first_zone}' is working!") - print(f" Data received: {len(str(result.data)) if result.data else 0} chars") - else: - print(f"❌ Zone '{first_zone}' returned error: {result.error}") - - except Exception as e: - print(f"❌ Zone test failed: {e}") - - # Export zones to file - print_section("EXPORT ZONES") - - export_file = Path("probe_tests/zones_config.json") - zones_data = { - "customer_id": info.get("customer_id"), - "timestamp": datetime.now().isoformat(), - "zones": zones, - "recommendations": { - "web_unlocker_zone": unlocker_zones[0].get("name") if unlocker_zones else None, - "serp_zone": serp_zones[0].get("name") if serp_zones else None, - "browser_zone": browser_zones[0].get("name") if browser_zones else None, - }, - } - - try: - export_file.write_text(json.dumps(zones_data, indent=2)) - print(f"✅ Zones configuration exported to: {export_file}") - print(" You can use this file to 
configure your SDK") - except Exception as e: - print(f"❌ Failed to export zones: {e}") - - # Summary - print_section("SUMMARY") - print(f"Total zones found: {len(zones)}") - print(f"Web Unlocker zones: {len(unlocker_zones)}") - print(f"SERP zones: {len(serp_zones)}") - print(f"Browser zones: {len(browser_zones)}") - - # Configuration recommendation - if zones: - print("\n📝 RECOMMENDED CLIENT CONFIGURATION:") - print("```python") - print("from brightdata import BrightDataClient") - print() - print("client = BrightDataClient(") - if unlocker_zones: - print(f' web_unlocker_zone="{unlocker_zones[0].get("name")}",') - if serp_zones: - print(f' serp_zone="{serp_zones[0].get("name")}",') - if browser_zones: - print(f' browser_zone="{browser_zones[0].get("name")}",') - print(")") - print("```") - - return True - - except AuthenticationError as e: - print(f"\n❌ Authentication failed: {e}") - print("Please check your API token") - return False - - except APIError as e: - print(f"\n❌ API error: {e}") - return False - - except Exception as e: - print(f"\n❌ Unexpected error: {e}") - traceback.print_exc() - return False - - -def main(): - """Run zone listing and analysis.""" - try: - success = test_list_zones() - - if success: - print("\n✅ Zone analysis completed successfully!") - return 0 - else: - print("\n❌ Zone analysis failed or incomplete") - return 1 - - except KeyboardInterrupt: - print("\n\n⚠️ Interrupted by user") - return 2 - - except Exception as e: - print(f"\n❌ Fatal error: {e}") - traceback.print_exc() - return 3 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tests/enes/zones/permission.py b/tests/enes/zones/permission.py deleted file mode 100644 index 8046d29..0000000 --- a/tests/enes/zones/permission.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python3 -""" -Test to demonstrate improved permission error handling. - -This test shows how the SDK now provides clear, helpful error messages -when API tokens lack zone creation permissions. 
-""" - -import os -import sys -import asyncio -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "src")) - -from brightdata import BrightDataClient -from brightdata.exceptions import AuthenticationError - - -async def test_permission_error_handling(): - """Test that permission errors are caught and displayed clearly.""" - - print("\n" + "=" * 70) - print("🧪 TESTING PERMISSION ERROR HANDLING") - print("=" * 70) - - print( - """ -This test demonstrates the improved error handling when your API token -lacks zone creation permissions. - -Expected behavior: - ✅ Clear error message explaining the issue - ✅ Direct link to fix the problem - ✅ No silent failures - ✅ Helpful instructions for users - """ - ) - - if not os.environ.get("BRIGHTDATA_API_TOKEN"): - print("\n❌ ERROR: No API token found") - return False - - client = BrightDataClient( - auto_create_zones=True, web_unlocker_zone="test_permission_zone", validate_token=False - ) - - print("🔧 Attempting to create a zone with auto_create_zones=True...") - print("-" * 70) - - try: - async with client: - # This will trigger zone creation - print("\n⏳ Initializing client (will attempt zone creation)...") - print(" If your token lacks permissions, you'll see a clear error message.\n") - - # If we get here, zones were created successfully or already exist - zones = await client.list_zones() - print(f"✅ SUCCESS: Client initialized, {len(zones)} zones available") - - # Check if our test zone exists - zone_names = {z.get("name") for z in zones} - if "test_permission_zone" in zone_names: - print(" ✓ Test zone was created successfully") - print(" ✓ Your API token HAS zone creation permissions") - else: - print(" ℹ️ Test zone not created (may already exist with different name)") - - return True - - except AuthenticationError as e: - print("\n" + "=" * 70) - print("✅ PERMISSION ERROR CAUGHT (Expected if you lack permissions)") - print("=" * 70) - print(f"\nError Message:\n{e}") - 
print("\n" + "=" * 70) - print("📝 This is the IMPROVED error handling!") - print("=" * 70) - print( - """ -Before: Error was unclear and could fail silently -After: Clear message with actionable steps to fix the issue - -The error message should have told you: - 1. ❌ What went wrong (permission denied) - 2. 🔗 Where to fix it (https://brightdata.com/cp/setting/users) - 3. 📋 What to do (enable zone creation permission) - """ - ) - return True # This is expected behavior - - except Exception as e: - print(f"\n❌ UNEXPECTED ERROR: {e}") - import traceback - - traceback.print_exc() - return False - - -if __name__ == "__main__": - try: - success = asyncio.run(test_permission_error_handling()) - - print("\n" + "=" * 70) - if success: - print("✅ TEST PASSED") - print("=" * 70) - print( - """ -Summary: - • Permission errors are now caught and displayed clearly - • Users get actionable instructions to fix the problem - • No more silent failures - • SDK provides helpful guidance - """ - ) - else: - print("❌ TEST FAILED") - print("=" * 70) - - sys.exit(0 if success else 1) - - except KeyboardInterrupt: - print("\n⚠️ Test interrupted") - sys.exit(2) diff --git a/tests/enes/zones/test_cache.py b/tests/enes/zones/test_cache.py deleted file mode 100644 index 356c401..0000000 --- a/tests/enes/zones/test_cache.py +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env python3 -""" -Test to demonstrate the caching issue with get_account_info(). 
-""" - -import os -import sys -import asyncio -import time -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from brightdata import BrightDataClient - - -async def test_caching_issue(): - """Demonstrate caching issue.""" - - print("\n" + "=" * 70) - print("CACHING ISSUE DEMONSTRATION") - print("=" * 70) - - if not os.environ.get("BRIGHTDATA_API_TOKEN"): - print("\n❌ ERROR: No API token found") - return False - - client = BrightDataClient( - auto_create_zones=True, - web_unlocker_zone=f"test_cache_{int(time.time()) % 100000}", - validate_token=False, - ) - - try: - async with client: - # Method 1: get_account_info() - CACHES the result - print("\n1️⃣ Using get_account_info() (first call)...") - info1 = await client.get_account_info() - zones1 = info1.get("zones", []) - print(f" Found {len(zones1)} zones via get_account_info()") - - # Method 2: list_zones() - Direct API call - print("\n2️⃣ Using list_zones() (first call)...") - zones2 = await client.list_zones() - print(f" Found {len(zones2)} zones via list_zones()") - - # Create a new zone - print("\n3️⃣ Creating a new test zone...") - test_zone = f"test_new_{int(time.time()) % 100000}" - temp = BrightDataClient( - auto_create_zones=True, web_unlocker_zone=test_zone, validate_token=False - ) - async with temp: - try: - await temp.scrape_url("https://example.com", zone=test_zone) - except Exception: - pass - print(f" Zone '{test_zone}' created") - - await asyncio.sleep(1) - - # Check again with both methods - print("\n4️⃣ Using get_account_info() (second call - CACHED)...") - info2 = await client.get_account_info() - zones3 = info2.get("zones", []) - print(f" Found {len(zones3)} zones via get_account_info()") - print(f" ⚠️ Same as before: {len(zones3) == len(zones1)}") - print(" 🔍 This is CACHED data!") - - print("\n5️⃣ Using list_zones() (second call - FRESH)...") - zones4 = await client.list_zones() - print(f" Found {len(zones4)} zones via list_zones()") - print(f" 
✅ New data: {len(zones4) > len(zones2)}") - print(" 🔍 This is FRESH data from API!") - - print("\n" + "=" * 70) - print("🔍 PROBLEM IDENTIFIED:") - print(" get_account_info() caches the result (line 367-368 in client.py)") - print(" If you use get_account_info()['zones'], you'll see stale data!") - print("\n✅ SOLUTION:") - print(" Always use list_zones() to get current zone list") - print("=" * 70) - - return True - - except Exception as e: - print(f"\n❌ Error: {e}") - import traceback - - traceback.print_exc() - return False - - -if __name__ == "__main__": - asyncio.run(test_caching_issue()) diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/fixtures/mock_data/.gitkeep b/tests/fixtures/mock_data/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/fixtures/responses/.gitkeep b/tests/fixtures/responses/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py index c210fac..e69de29 100644 --- a/tests/integration/__init__.py +++ b/tests/integration/__init__.py @@ -1 +0,0 @@ -"""Integration tests.""" diff --git a/tests/integration/test_browser_api.py b/tests/integration/test_browser_api.py deleted file mode 100644 index eb13cc9..0000000 --- a/tests/integration/test_browser_api.py +++ /dev/null @@ -1 +0,0 @@ -"""Integration tests for Browser API.""" diff --git a/tests/integration/test_client_integration.py b/tests/integration/test_client_integration.py deleted file mode 100644 index bf1810f..0000000 --- a/tests/integration/test_client_integration.py +++ /dev/null @@ -1,220 +0,0 @@ -"""Integration tests for BrightDataClient API calls.""" - -import os -import pytest -from pathlib import Path - -# Load environment variables from .env file -try: - from dotenv import load_dotenv - - env_file = Path(__file__).parent.parent.parent.parent / ".env" - if env_file.exists(): - load_dotenv(env_file) -except 
ImportError: - pass - -from brightdata import BrightDataClient, SyncBrightDataClient -from brightdata.exceptions import AuthenticationError - - -@pytest.fixture -def api_token(): - """Get API token from environment or skip tests.""" - token = os.getenv("BRIGHTDATA_API_TOKEN") - if not token: - pytest.skip("API token not found. Set BRIGHTDATA_API_TOKEN to run integration tests.") - return token - - -@pytest.fixture -def client(api_token): - """Create async client instance for testing (must be used with async context).""" - return BrightDataClient(token=api_token) - - -@pytest.fixture -def sync_client(api_token): - """Create sync client instance for testing.""" - with SyncBrightDataClient(token=api_token) as client: - yield client - - -@pytest.fixture -async def async_client(api_token): - """Create async client instance for testing.""" - async with BrightDataClient(token=api_token) as client: - yield client - - -class TestConnectionTesting: - """Test connection testing functionality.""" - - @pytest.mark.asyncio - async def test_connection_with_valid_token(self, async_client): - """Test connection succeeds with valid token.""" - is_valid = await async_client.test_connection() - - assert is_valid is True - assert async_client._is_connected is True - - @pytest.mark.asyncio - async def test_connection_with_invalid_token(self): - """Test connection returns False with invalid token.""" - client = BrightDataClient(token="invalid_token_123456789", auto_create_zones=False) - - async with client: - # test_connection() never raises - returns False for invalid tokens - is_valid = await client.test_connection() - assert is_valid is False - - def test_connection_sync_with_valid_token(self, sync_client): - """Test synchronous connection test using SyncBrightDataClient.""" - is_valid = sync_client.test_connection() - - assert is_valid is True - - -class TestAccountInfo: - """Test account information retrieval.""" - - @pytest.mark.asyncio - async def 
test_get_account_info_success(self, async_client): - """Test getting account info with valid token.""" - info = await async_client.get_account_info() - - assert isinstance(info, dict) - assert "zones" in info - assert "zone_count" in info - assert "token_valid" in info - assert "retrieved_at" in info - - assert info["token_valid"] is True - assert isinstance(info["zones"], list) - assert info["zone_count"] == len(info["zones"]) - - @pytest.mark.asyncio - async def test_get_account_info_returns_zones(self, async_client): - """Test account info includes zones list.""" - info = await async_client.get_account_info() - - zones = info.get("zones", []) - assert isinstance(zones, list) - - # If zones exist, check structure - if zones: - for zone in zones: - assert isinstance(zone, dict) - # Zones should have at least a name - assert "name" in zone or "zone" in zone - - @pytest.mark.asyncio - async def test_get_account_info_with_invalid_token(self): - """Test getting account info fails with invalid token.""" - client = BrightDataClient(token="invalid_token_123456789", auto_create_zones=False) - - async with client: - with pytest.raises(AuthenticationError) as exc_info: - await client.get_account_info() - - assert "Invalid token" in str(exc_info.value) or "401" in str(exc_info.value) - - def test_get_account_info_sync(self, sync_client): - """Test synchronous account info retrieval using SyncBrightDataClient.""" - info = sync_client.get_account_info() - - assert isinstance(info, dict) - assert "zones" in info - assert "token_valid" in info - - @pytest.mark.asyncio - async def test_account_info_is_cached(self, async_client): - """Test account info is cached after first retrieval.""" - # First call - info1 = await async_client.get_account_info() - - # Second call should return cached version - info2 = await async_client.get_account_info() - - assert info1 is info2 # Same object reference - assert info1["retrieved_at"] == info2["retrieved_at"] - - @pytest.mark.asyncio - async 
def test_account_info_includes_customer_id(self, api_token): - """Test account info includes customer ID if provided.""" - customer_id = os.getenv("BRIGHTDATA_CUSTOMER_ID") - - async with BrightDataClient(token=api_token, customer_id=customer_id) as client: - info = await client.get_account_info() - - if customer_id: - assert info.get("customer_id") == customer_id - - -class TestClientInitializationWithValidation: - """Test client initialization with token validation.""" - - def test_client_with_validate_token_true_and_valid_token(self, api_token): - """Test client initialization validates token when requested.""" - # Should not raise any exception - client = BrightDataClient(token=api_token, validate_token=True) - assert client.token == api_token - - @pytest.mark.asyncio - async def test_client_with_validate_token_true_and_invalid_token(self): - """Test client raises error on __aenter__ if token is invalid and validation enabled.""" - client = BrightDataClient( - token="invalid_token_123456789", validate_token=True, auto_create_zones=False - ) - with pytest.raises(AuthenticationError): - async with client: - pass # Should not reach here - - def test_client_with_validate_token_false_accepts_any_token(self): - """Test client accepts any token format when validation disabled.""" - # Should not raise exception even with invalid token - client = BrightDataClient(token="invalid_token_123456789", validate_token=False) - assert client.token == "invalid_token_123456789" - - -class TestLegacyAPICompatibility: - """Test backward compatibility with old flat API.""" - - @pytest.mark.asyncio - async def test_scrape_url_async_works(self, async_client): - """Test legacy scrape_url_async method works.""" - # Simple test URL - result = await async_client.scrape_url(url="https://httpbin.org/html") - - assert result is not None - assert hasattr(result, "success") - assert hasattr(result, "data") - - def test_scrape_url_sync_works(self, sync_client): - """Test scrape_url method works 
synchronously using SyncBrightDataClient.""" - result = sync_client.scrape_url(url="https://httpbin.org/html") - - assert result is not None - assert hasattr(result, "success") - - -class TestClientErrorHandling: - """Test client error handling in various scenarios.""" - - @pytest.mark.asyncio - async def test_connection_test_returns_false_on_network_error(self): - """Test connection test returns False (not exception) on network errors.""" - client = BrightDataClient(token="test_token_123456789", auto_create_zones=False) - - async with client: - # Should return False, not raise exception - is_valid = await client.test_connection() - # With invalid token, should return False - assert is_valid is False - - def test_sync_connection_test_returns_false_on_error(self): - """Test sync connection test returns False on errors using SyncBrightDataClient.""" - with SyncBrightDataClient(token="test_token_123456789", auto_create_zones=False) as client: - # Should return False, not raise exception - is_valid = client.test_connection() - assert is_valid is False diff --git a/tests/integration/test_crawl_api.py b/tests/integration/test_crawl_api.py deleted file mode 100644 index af7fb9c..0000000 --- a/tests/integration/test_crawl_api.py +++ /dev/null @@ -1 +0,0 @@ -"""Integration tests for Crawl API.""" diff --git a/tests/integration/test_serp_api.py b/tests/integration/test_serp_api.py deleted file mode 100644 index e95c396..0000000 --- a/tests/integration/test_serp_api.py +++ /dev/null @@ -1 +0,0 @@ -"""Integration tests for SERP API.""" diff --git a/tests/integration/test_serp_async_mode.py b/tests/integration/test_serp_async_mode.py deleted file mode 100644 index 6348649..0000000 --- a/tests/integration/test_serp_async_mode.py +++ /dev/null @@ -1,231 +0,0 @@ -"""Integration tests for SERP async mode. - -These tests verify that: -1. Sync mode still works (backwards compatibility) -2. Async mode works end-to-end -3. Default mode is sync -4. 
Both modes return the same normalized data structure -""" - -import os -import pytest -from pathlib import Path - -# Load environment variables from .env file -try: - from dotenv import load_dotenv - - env_file = Path(__file__).parent.parent.parent / ".env" - if env_file.exists(): - load_dotenv(env_file) -except ImportError: - pass - -from brightdata import BrightDataClient - - -@pytest.fixture -def api_token(): - """Get API token from environment or skip tests.""" - token = os.getenv("BRIGHTDATA_API_TOKEN") - if not token: - pytest.skip("API token not found. Set BRIGHTDATA_API_TOKEN to run integration tests.") - return token - - -@pytest.fixture -async def async_client(api_token): - """Create async client instance for testing.""" - async with BrightDataClient(token=api_token) as client: - yield client - - -class TestSERPAsyncMode: - """Test SERP async mode functionality.""" - - @pytest.mark.asyncio - @pytest.mark.integration - async def test_google_search_sync_mode_explicit(self, async_client): - """Test sync mode still works when explicitly specified.""" - result = await async_client.search.google( - query="python programming", zone=async_client.serp_zone, mode="sync" # Explicit sync - ) - - assert result.success is True, f"Search failed: {result.error}" - assert result.data is not None - assert len(result.data) > 0, "No search results returned" - assert result.search_engine == "google" - assert result.query["q"] == "python programming" - - @pytest.mark.asyncio - @pytest.mark.integration - async def test_google_search_default_is_sync(self, async_client): - """Test default mode is sync (backwards compatibility).""" - result = await async_client.search.google( - query="test query", - zone=async_client.serp_zone, - # No mode parameter - should default to sync - ) - - assert result.success is True, f"Search failed: {result.error}" - assert result.data is not None - assert len(result.data) > 0 - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - 
async def test_google_search_async_mode(self, async_client): - """Test async mode with polling.""" - result = await async_client.search.google( - query="python programming", - zone=async_client.serp_zone, - mode="async", - poll_interval=2, # Check every 2 seconds - poll_timeout=30, # Give up after 30 seconds - ) - - assert result.success is True, f"Async search failed: {result.error}" - assert result.data is not None - assert len(result.data) > 0, "No search results from async mode" - assert result.search_engine == "google" - assert result.query["q"] == "python programming" - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - async def test_async_mode_returns_same_structure_as_sync(self, async_client): - """Test that async mode returns same normalized structure as sync.""" - query = "machine learning" - - # Run sync mode - sync_result = await async_client.search.google( - query=query, zone=async_client.serp_zone, mode="sync" - ) - - # Run async mode - async_result = await async_client.search.google( - query=query, zone=async_client.serp_zone, mode="async", poll_interval=2, poll_timeout=30 - ) - - # Both should succeed - assert sync_result.success is True - assert async_result.success is True - - # Both should have data - assert sync_result.data is not None - assert async_result.data is not None - - # Both should be lists - assert isinstance(sync_result.data, list) - assert isinstance(async_result.data, list) - - # Both should have results - assert len(sync_result.data) > 0 - assert len(async_result.data) > 0 - - # Structure should be the same (both have rank, title, link, etc.) 
- if len(sync_result.data) > 0 and len(async_result.data) > 0: - sync_first = sync_result.data[0] - async_first = async_result.data[0] - - # Check that both have the same fields - assert "rank" in sync_first - assert "rank" in async_first - assert "title" in sync_first or "snippet" in sync_first - assert "title" in async_first or "snippet" in async_first - - @pytest.mark.asyncio - @pytest.mark.integration - async def test_async_mode_with_short_timeout(self, async_client): - """Test async mode timeout handling.""" - # Use very short timeout to force timeout error - result = await async_client.search.google( - query="test", - zone=async_client.serp_zone, - mode="async", - poll_interval=1, - poll_timeout=1, # Very short timeout - ) - - # Should fail with timeout error - assert result.success is False - assert result.error is not None - assert "timeout" in result.error.lower() - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - async def test_async_mode_multiple_queries(self, async_client): - """Test async mode with multiple queries (batch processing).""" - queries = ["python", "javascript", "golang"] - - results = await async_client.search.google( - query=queries, - zone=async_client.serp_zone, - mode="async", - poll_interval=2, - poll_timeout=60, # Longer timeout for multiple queries - ) - - # Should get results for all queries - assert len(results) == 3 - - # Check each result - for i, result in enumerate(results): - assert result.success is True, f"Query {i} failed: {result.error}" - assert result.data is not None - assert len(result.data) > 0 - - @pytest.mark.asyncio - @pytest.mark.integration - async def test_sync_mode_with_location(self, async_client): - """Test sync mode with location parameter.""" - result = await async_client.search.google( - query="restaurants", zone=async_client.serp_zone, location="US", mode="sync" - ) - - assert result.success is True - assert result.data is not None - - @pytest.mark.asyncio - 
@pytest.mark.integration - @pytest.mark.slow - async def test_async_mode_with_location(self, async_client): - """Test async mode with location parameter.""" - result = await async_client.search.google( - query="restaurants", - zone=async_client.serp_zone, - location="US", - mode="async", - poll_interval=2, - poll_timeout=30, - ) - - assert result.success is True - assert result.data is not None - - -class TestSERPAsyncModeTiming: - """Test async mode timing and performance characteristics.""" - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - async def test_async_mode_has_timing_metadata(self, async_client): - """Test that async mode populates timing metadata.""" - result = await async_client.search.google( - query="test", - zone=async_client.serp_zone, - mode="async", - poll_interval=2, - poll_timeout=30, - ) - - assert result.success is True - - # Check timing metadata - assert result.trigger_sent_at is not None - assert result.data_fetched_at is not None - - # Data fetch should be after trigger - assert result.data_fetched_at >= result.trigger_sent_at diff --git a/tests/integration/test_web_unlocker_api.py b/tests/integration/test_web_unlocker_api.py deleted file mode 100644 index 410cf59..0000000 --- a/tests/integration/test_web_unlocker_api.py +++ /dev/null @@ -1 +0,0 @@ -"""Integration tests for Web Unlocker API.""" diff --git a/tests/integration/test_web_unlocker_async_mode.py b/tests/integration/test_web_unlocker_async_mode.py deleted file mode 100644 index 609c0d5..0000000 --- a/tests/integration/test_web_unlocker_async_mode.py +++ /dev/null @@ -1,254 +0,0 @@ -"""Integration tests for Web Unlocker async mode. - -These tests verify that: -1. Sync mode still works (backwards compatibility) -2. Async mode works end-to-end -3. Default mode is sync -4. 
Both modes return the same normalized data structure -""" - -import os -import pytest -from pathlib import Path - -# Load environment variables from .env file -try: - from dotenv import load_dotenv - - env_file = Path(__file__).parent.parent.parent / ".env" - if env_file.exists(): - load_dotenv(env_file) -except ImportError: - pass - -from brightdata import BrightDataClient - - -@pytest.fixture -def api_token(): - """Get API token from environment or skip tests.""" - token = os.getenv("BRIGHTDATA_API_TOKEN") - if not token: - pytest.skip("API token not found. Set BRIGHTDATA_API_TOKEN to run integration tests.") - return token - - -@pytest.fixture -async def async_client(api_token): - """Create async client instance for testing.""" - async with BrightDataClient(token=api_token) as client: - yield client - - -class TestWebUnlockerAsyncMode: - """Test Web Unlocker async mode functionality.""" - - @pytest.mark.asyncio - @pytest.mark.integration - async def test_scrape_sync_mode_explicit(self, async_client): - """Test sync mode still works when explicitly specified.""" - result = await async_client.scrape_url( - url="https://example.com", - zone=async_client.web_unlocker_zone, - mode="sync", # Explicit sync - ) - - assert result.success is True, f"Scrape failed: {result.error}" - assert result.data is not None - assert isinstance(result.data, str) - assert len(result.data) > 0, "No data returned" - assert result.method == "web_unlocker" - - @pytest.mark.asyncio - @pytest.mark.integration - async def test_scrape_default_is_sync(self, async_client): - """Test default mode is sync (backwards compatibility).""" - result = await async_client.scrape_url( - url="https://example.com", - zone=async_client.web_unlocker_zone, - # No mode parameter - should default to sync - ) - - assert result.success is True, f"Scrape failed: {result.error}" - assert result.data is not None - assert isinstance(result.data, str) - - @pytest.mark.asyncio - @pytest.mark.integration - 
@pytest.mark.slow - async def test_scrape_async_mode(self, async_client): - """Test async mode with polling.""" - result = await async_client.scrape_url( - url="https://example.com", - zone=async_client.web_unlocker_zone, - mode="async", - poll_interval=2, # Check every 2 seconds - poll_timeout=30, # Give up after 30 seconds - ) - - assert result.success is True, f"Async scrape failed: {result.error}" - assert result.data is not None - assert isinstance(result.data, str) - assert len(result.data) > 0, "No data from async mode" - assert result.method == "web_unlocker" - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - async def test_async_mode_returns_same_structure_as_sync(self, async_client): - """Test that async mode returns same normalized structure as sync.""" - url = "https://example.com" - - # Run sync mode - sync_result = await async_client.scrape_url( - url=url, zone=async_client.web_unlocker_zone, mode="sync" - ) - - # Run async mode - async_result = await async_client.scrape_url( - url=url, - zone=async_client.web_unlocker_zone, - mode="async", - poll_interval=2, - poll_timeout=30, - ) - - # Both should succeed - assert sync_result.success is True - assert async_result.success is True - - # Both should have data - assert sync_result.data is not None - assert async_result.data is not None - - # Both should be strings (raw HTML) - assert isinstance(sync_result.data, str) - assert isinstance(async_result.data, str) - - # Both should have content - assert len(sync_result.data) > 0 - assert len(async_result.data) > 0 - - # Both should have the same method - assert sync_result.method == "web_unlocker" - assert async_result.method == "web_unlocker" - - @pytest.mark.asyncio - @pytest.mark.integration - async def test_async_mode_with_short_timeout(self, async_client): - """Test async mode timeout handling.""" - # Use very short timeout to force timeout error - result = await async_client.scrape_url( - url="https://example.com", - 
zone=async_client.web_unlocker_zone, - mode="async", - poll_interval=1, - poll_timeout=1, # Very short timeout - ) - - # Should fail with timeout error - assert result.success is False - assert result.error is not None - assert "timeout" in result.error.lower() - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - async def test_async_mode_multiple_urls(self, async_client): - """Test async mode with multiple URLs (batch processing).""" - urls = ["https://example.com", "https://www.example.org", "https://www.example.net"] - - results = await async_client.scrape_url( - url=urls, - zone=async_client.web_unlocker_zone, - mode="async", - poll_interval=2, - poll_timeout=60, # Longer timeout for multiple URLs - ) - - # Should get results for all URLs - assert len(results) == 3 - - # Check each result - for i, result in enumerate(results): - assert result.success is True, f"URL {i} failed: {result.error}" - assert result.data is not None - assert isinstance(result.data, str) - assert len(result.data) > 0 - - @pytest.mark.asyncio - @pytest.mark.integration - async def test_sync_mode_with_country(self, async_client): - """Test sync mode with country parameter.""" - result = await async_client.scrape_url( - url="https://example.com", - zone=async_client.web_unlocker_zone, - country="US", - mode="sync", - ) - - assert result.success is True - assert result.data is not None - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - async def test_async_mode_with_country(self, async_client): - """Test async mode with country parameter.""" - result = await async_client.scrape_url( - url="https://example.com", - zone=async_client.web_unlocker_zone, - country="US", - mode="async", - poll_interval=2, - poll_timeout=30, - ) - - assert result.success is True - assert result.data is not None - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - async def test_async_mode_with_json_response(self, async_client): - """Test async mode 
with JSON response format.""" - result = await async_client.scrape_url( - url="https://httpbin.org/json", - zone=async_client.web_unlocker_zone, - response_format="json", - mode="async", - poll_interval=2, - poll_timeout=30, - ) - - assert result.success is True - assert result.data is not None - # When response_format="json", data should be a dict - if result.success: - assert isinstance(result.data, (dict, list)) - - -class TestWebUnlockerAsyncModeTiming: - """Test async mode timing and performance characteristics.""" - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - async def test_async_mode_has_timing_metadata(self, async_client): - """Test that async mode populates timing metadata.""" - result = await async_client.scrape_url( - url="https://example.com", - zone=async_client.web_unlocker_zone, - mode="async", - poll_interval=2, - poll_timeout=30, - ) - - assert result.success is True - - # Check timing metadata - assert result.trigger_sent_at is not None - assert result.data_fetched_at is not None - - # Data fetch should be after trigger - assert result.data_fetched_at >= result.trigger_sent_at diff --git a/tests/readme.py b/tests/readme.py deleted file mode 100644 index 8b0a183..0000000 --- a/tests/readme.py +++ /dev/null @@ -1,1044 +0,0 @@ -""" -Tests to validate all code samples in README.md. - -This test suite ensures that all code examples in the README.md file are accurate -and functional. Tests are organized by README sections and include: -- Authentication examples -- Simple web scraping examples -- Dataclass payload examples -- Pandas integration examples -- Platform-specific scraping (Amazon, LinkedIn, ChatGPT, Facebook, Instagram) -- SERP API examples (Google, Bing, Yandex) -- Async usage examples -- CLI tool examples -- Advanced usage examples -- Complete workflow example - -All tests use real API calls (no mocking) to ensure documentation accuracy. 
-""" - -import os -import json -import subprocess -import pytest -from pathlib import Path - -# Load environment variables from .env file -try: - from dotenv import load_dotenv - - env_file = Path(__file__).parent.parent / ".env" - if env_file.exists(): - load_dotenv(env_file) -except ImportError: - pass - -from brightdata import BrightDataClient -from brightdata.payloads import ( - AmazonProductPayload, - LinkedInJobSearchPayload, - ChatGPTPromptPayload, -) - - -@pytest.fixture -def api_token(): - """Get API token from environment or skip tests.""" - token = os.getenv("BRIGHTDATA_API_TOKEN") - if not token: - pytest.skip("API token not found. Set BRIGHTDATA_API_TOKEN to run README validation tests.") - return token - - -@pytest.fixture -def client(api_token): - """Create synchronous client instance for testing.""" - return BrightDataClient(token=api_token) - - -@pytest.fixture -async def async_client(api_token): - """Create async client instance for testing.""" - async with BrightDataClient(token=api_token) as client: - yield client - - -class TestQuickStartAuthentication: - """Test authentication examples from Quick Start section.""" - - def test_environment_variable_auth(self, api_token): - """ - Test: README Quick Start - Authentication with environment variable. - Line: 106-107 - """ - # From README: client = BrightDataClient() - client = BrightDataClient() - - assert client is not None, "Client initialization failed" - assert client.token == api_token, "Token not loaded from environment" - - def test_direct_credentials_auth(self): - """ - Test: README Quick Start - Authentication with direct credentials. 
- Line: 92-98 - """ - token = os.getenv("BRIGHTDATA_API_TOKEN") - if not token: - pytest.skip("API token not found") - - customer_id = os.getenv("BRIGHTDATA_CUSTOMER_ID") - - # From README - client = BrightDataClient(token=token, customer_id=customer_id) - - assert client is not None, "Client initialization failed" - assert client.token == token, "Token not set correctly" - - -class TestQuickStartSimpleScraping: - """Test simple web scraping example from Quick Start.""" - - def test_simple_web_scraping(self, client): - """ - Test: README Quick Start - Simple Web Scraping. - Line: 101-118 - """ - # From README: - # result = client.scrape_url("https://example.com") - # if result.success: - # print(f"Success: {result.success}") - # print(f"Data: {result.data[:200]}...") - # print(f"Time: {result.elapsed_ms():.2f}ms") - - result = client.scrape_url("https://example.com") - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - assert hasattr(result, "data"), "Result missing 'data' attribute" - assert hasattr(result, "error"), "Result missing 'error' attribute" - - # Verify we can access the attributes as shown in README - if result.success: - assert result.data is not None, "data should not be None when success=True" - elapsed = result.elapsed_ms() - assert isinstance(elapsed, (int, float)), "elapsed_ms() should return number" - assert elapsed >= 0, "elapsed_ms() should be non-negative" - - -class TestDataclassPayloads: - """Test dataclass payload examples from README.""" - - def test_amazon_payload_basic(self): - """ - Test: README - Using Dataclass Payloads with Amazon. 
- Line: 128-135 - """ - # From README: - # payload = AmazonProductPayload( - # url="https://amazon.com/dp/B123456789", - # reviews_count=50 - # ) - # print(f"ASIN: {payload.asin}") - - payload = AmazonProductPayload(url="https://amazon.com/dp/B0CRMZHDG8", reviews_count=50) - - # Verify helper property - assert payload.asin == "B0CRMZHDG8", f"Expected ASIN 'B0CRMZHDG8', got '{payload.asin}'" - - # Verify to_dict() method - api_dict = payload.to_dict() - assert isinstance(api_dict, dict), "to_dict() should return dict" - assert "url" in api_dict, "to_dict() missing 'url' key" - - def test_linkedin_job_payload(self): - """ - Test: README - LinkedIn job search payload. - Line: 138-145 - """ - # From README: - # job_payload = LinkedInJobSearchPayload( - # keyword="python developer", - # location="New York", - # remote=True - # ) - # print(f"Remote search: {job_payload.is_remote_search}") - - job_payload = LinkedInJobSearchPayload( - keyword="python developer", location="New York", remote=True - ) - - assert job_payload.is_remote_search is True, "is_remote_search should be True" - - api_dict = job_payload.to_dict() - assert isinstance(api_dict, dict), "to_dict() should return dict" - assert "keyword" in api_dict, "to_dict() missing 'keyword'" - - def test_amazon_payload_detailed(self): - """ - Test: README - Amazon payload with helper properties. 
- Line: 711-723 - """ - # From README: - # payload = AmazonProductPayload( - # url="https://amazon.com/dp/B123456789", - # reviews_count=50, - # images_count=10 - # ) - # print(payload.asin) # "B123456789" - # print(payload.domain) # "amazon.com" - # print(payload.is_secure) # True - - payload = AmazonProductPayload( - url="https://amazon.com/dp/B0CRMZHDG8", reviews_count=50, images_count=10 - ) - - assert payload.asin == "B0CRMZHDG8", "ASIN extraction failed" - assert payload.domain == "amazon.com", "Domain extraction failed" - assert payload.is_secure is True, "is_secure should be True for https" - - api_dict = payload.to_dict() - assert "url" in api_dict, "to_dict() missing 'url'" - - def test_linkedin_job_payload_detailed(self): - """ - Test: README - LinkedIn payload with helper properties. - Line: 731-742 - """ - # From README: - # payload = LinkedInJobSearchPayload( - # keyword="python developer", - # location="San Francisco", - # remote=True, - # experienceLevel="mid" - # ) - # print(payload.is_remote_search) # True - - payload = LinkedInJobSearchPayload( - keyword="python developer", location="San Francisco", remote=True, experienceLevel="mid" - ) - - assert payload.is_remote_search is True, "is_remote_search should be True" - - api_dict = payload.to_dict() - assert api_dict["keyword"] == "python developer", "Keyword mismatch" - assert api_dict["remote"] is True, "Remote should be True" - - def test_chatgpt_payload_defaults(self): - """ - Test: README - ChatGPT payload with default values. 
- Line: 750-757 - """ - # From README: - # payload = ChatGPTPromptPayload( - # prompt="Explain async programming", - # web_search=True - # ) - # print(payload.country) # "US" (default) - # print(payload.uses_web_search) # True - - payload = ChatGPTPromptPayload(prompt="Explain async programming", web_search=True) - - assert payload.country == "US", "Default country should be 'US'" - assert payload.uses_web_search is True, "uses_web_search should be True" - - def test_payload_validation_invalid_url(self): - """ - Test: README - Payload validation for invalid URL. - Line: 764-767 - """ - # From README: - # try: - # AmazonProductPayload(url="invalid-url") - # except ValueError as e: - # print(e) # "url must be valid HTTP/HTTPS URL" - - with pytest.raises(ValueError) as exc_info: - AmazonProductPayload(url="invalid-url") - - error_msg = str(exc_info.value).lower() - assert "url" in error_msg, f"Error should mention 'url', got: {error_msg}" - - def test_payload_validation_negative_count(self): - """ - Test: README - Payload validation for negative reviews_count. - Line: 769-775 - """ - # From README: - # try: - # AmazonProductPayload( - # url="https://amazon.com/dp/B123", - # reviews_count=-1 - # ) - # except ValueError as e: - # print(e) # "reviews_count must be non-negative" - - with pytest.raises(ValueError) as exc_info: - AmazonProductPayload(url="https://amazon.com/dp/B0CRMZHDG8", reviews_count=-1) - - error_msg = str(exc_info.value).lower() - assert ( - "reviews_count" in error_msg or "negative" in error_msg - ), f"Error should mention reviews_count or negative, got: {error_msg}" - - -class TestPlatformSpecificAmazon: - """Test Amazon platform-specific examples from README.""" - - @pytest.mark.slow - def test_amazon_product_scraping(self, client): - """ - Test: README - Amazon product scraping. 
- Line: 183-187 - """ - # From README: - # result = client.scrape.amazon.products( - # url="https://amazon.com/dp/B0CRMZHDG8", - # timeout=65 - # ) - - result = client.scrape.amazon.products(url="https://amazon.com/dp/B0CRMZHDG8", timeout=65) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - assert hasattr(result, "data"), "Result missing 'data' attribute" - - @pytest.mark.slow - def test_amazon_reviews_with_filters(self, client): - """ - Test: README - Amazon reviews with filters. - Line: 189-195 - """ - # From README: - # result = client.scrape.amazon.reviews( - # url="https://amazon.com/dp/B0CRMZHDG8", - # pastDays=30, - # keyWord="quality", - # numOfReviews=100 - # ) - - result = client.scrape.amazon.reviews( - url="https://amazon.com/dp/B0CRMZHDG8", - pastDays=30, - keyWord="quality", - numOfReviews=10, # Reduced for faster testing - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - def test_amazon_sellers(self, client): - """ - Test: README - Amazon seller information. - Line: 197-200 - """ - # From README: - # result = client.scrape.amazon.sellers( - # url="https://amazon.com/sp?seller=AXXXXXXXXX" - # ) - - # Using a real seller URL for testing - result = client.scrape.amazon.sellers(url="https://amazon.com/sp?seller=A2L77EE7U53NWQ") - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - -class TestPlatformSpecificLinkedIn: - """Test LinkedIn platform-specific examples from README.""" - - @pytest.mark.slow - def test_linkedin_profile_scraping(self, client): - """ - Test: README - LinkedIn profile scraping. 
- Line: 206-209 - """ - # From README: - # result = client.scrape.linkedin.profiles( - # url="https://linkedin.com/in/johndoe" - # ) - - result = client.scrape.linkedin.profiles(url="https://linkedin.com/in/williamhgates") - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - def test_linkedin_jobs_scrape(self, client): - """ - Test: README - LinkedIn job scraping by URL. - Line: 211-213 - """ - # From README: - # result = client.scrape.linkedin.jobs( - # url="https://linkedin.com/jobs/view/123456" - # ) - - # Using a real job URL for testing - result = client.scrape.linkedin.jobs(url="https://linkedin.com/jobs/view/3000000000") - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - def test_linkedin_companies(self, client): - """ - Test: README - LinkedIn company scraping. - Line: 215-217 - """ - # From README: - # result = client.scrape.linkedin.companies( - # url="https://linkedin.com/company/microsoft" - # ) - - result = client.scrape.linkedin.companies(url="https://linkedin.com/company/microsoft") - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - def test_linkedin_job_search(self, client): - """ - Test: README - LinkedIn job search/discovery. 
- Line: 224-229 - """ - # From README: - # result = client.search.linkedin.jobs( - # keyword="python developer", - # location="New York", - # remote=True, - # experienceLevel="mid" - # ) - - result = client.search.linkedin.jobs( - keyword="python developer", location="New York", remote=True, experienceLevel="mid" - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - def test_linkedin_profile_search(self, client): - """ - Test: README - LinkedIn profile search. - Line: 231-234 - """ - # From README: - # result = client.search.linkedin.profiles( - # firstName="John", - # lastName="Doe" - # ) - - result = client.search.linkedin.profiles(firstName="Bill", lastName="Gates") - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - -class TestPlatformSpecificChatGPT: - """Test ChatGPT platform-specific examples from README.""" - - @pytest.mark.slow - def test_chatgpt_single_prompt(self, client): - """ - Test: README - ChatGPT single prompt. - Line: 246-251 - """ - # From README: - # result = client.scrape.chatgpt.prompt( - # prompt="Explain Python async programming", - # country="us", - # web_search=True - # ) - - result = client.scrape.chatgpt.prompt( - prompt="Explain Python async programming", country="us", web_search=True - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - def test_chatgpt_batch_prompts(self, client): - """ - Test: README - ChatGPT batch prompts. 
- Line: 253-257 - """ - # From README: - # result = client.scrape.chatgpt.prompts( - # prompts=["What is Python?", "What is JavaScript?", "Compare them"], - # web_searches=[False, False, True] - # ) - - result = client.scrape.chatgpt.prompts( - prompts=["What is Python?", "What is JavaScript?"], web_searches=[False, False] - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - -class TestPlatformSpecificFacebook: - """Test Facebook platform-specific examples from README.""" - - @pytest.mark.slow - def test_facebook_posts_by_profile(self, client): - """ - Test: README - Facebook posts from profile. - Line: 263-270 - """ - # From README: - # result = client.scrape.facebook.posts_by_profile( - # url="https://facebook.com/profile", - # num_of_posts=10, - # start_date="01-01-2025", - # end_date="12-31-2025", - # timeout=240 - # ) - - result = client.scrape.facebook.posts_by_profile( - url="https://facebook.com/zuck", - num_of_posts=5, - start_date="01-01-2025", - end_date="12-31-2025", - timeout=240, - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - def test_facebook_posts_by_group(self, client): - """ - Test: README - Facebook posts from group. 
- Line: 272-277 - """ - # From README: - # result = client.scrape.facebook.posts_by_group( - # url="https://facebook.com/groups/example", - # num_of_posts=20, - # timeout=240 - # ) - - result = client.scrape.facebook.posts_by_group( - url="https://facebook.com/groups/programming", num_of_posts=5, timeout=240 - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - -class TestPlatformSpecificInstagram: - """Test Instagram platform-specific examples from README.""" - - @pytest.mark.slow - def test_instagram_profile_scraping(self, client): - """ - Test: README - Instagram profile scraping. - Line: 305-309 - """ - # From README: - # result = client.scrape.instagram.profiles( - # url="https://instagram.com/username", - # timeout=240 - # ) - - result = client.scrape.instagram.profiles( - url="https://instagram.com/instagram", timeout=240 - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - def test_instagram_post_scraping(self, client): - """ - Test: README - Instagram specific post scraping. - Line: 311-315 - """ - # From README: - # result = client.scrape.instagram.posts( - # url="https://instagram.com/p/ABC123", - # timeout=240 - # ) - - result = client.scrape.instagram.posts( - url="https://instagram.com/p/C0000000000", timeout=240 - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - def test_instagram_post_discovery(self, client): - """ - Test: README - Instagram post discovery with filters. 
- Line: 329-337 - """ - # From README: - # result = client.search.instagram.posts( - # url="https://instagram.com/username", - # num_of_posts=10, - # start_date="01-01-2025", - # end_date="12-31-2025", - # post_type="reel", - # timeout=240 - # ) - - result = client.search.instagram.posts( - url="https://instagram.com/instagram", - num_of_posts=5, - start_date="01-01-2025", - end_date="12-31-2025", - post_type="reel", - timeout=240, - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - -class TestSERPAPI: - """Test SERP API examples from README.""" - - def test_google_search(self, client): - """ - Test: README - Google search. - Line: 352-358 - """ - # From README: - # result = client.search.google( - # query="python tutorial", - # location="United States", - # language="en", - # num_results=20 - # ) - - result = client.search.google( - query="python tutorial", location="United States", language="en", num_results=10 - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - assert hasattr(result, "data"), "Result missing 'data' attribute" - - # From README: for item in result.data: - if result.success and result.data: - for item in result.data[:3]: - # Items should have position, title, or url - assert isinstance(item, dict), "Search result items should be dicts" - - def test_bing_search(self, client): - """ - Test: README - Bing search. - Line: 365-369 - """ - # From README: - # result = client.search.bing( - # query="python tutorial", - # location="United States" - # ) - - result = client.search.bing(query="python tutorial", location="United States") - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - def test_yandex_search(self, client): - """ - Test: README - Yandex search. 
- Line: 371-375 - """ - # From README: - # result = client.search.yandex( - # query="python tutorial", - # location="Russia" - # ) - - result = client.search.yandex(query="python tutorial", location="Russia") - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - -class TestAsyncUsage: - """Test async usage examples from README.""" - - @pytest.mark.asyncio - async def test_async_multiple_urls(self, api_token): - """ - Test: README - Async usage with multiple URLs. - Line: 382-399 - """ - # From README: - # async def scrape_multiple(): - # async with BrightDataClient() as client: - # results = await client.scrape_url([ - # "https://example1.com", - # "https://example2.com", - # "https://example3.com" - # ]) - # for result in results: - # print(f"Success: {result.success}") - - async with BrightDataClient(token=api_token) as client: - results = await client.scrape_url( - ["https://httpbin.org/html", "https://example.com", "https://httpbin.org/json"] - ) - - assert results is not None, "Results is None" - assert isinstance(results, list), "Results should be a list" - assert len(results) == 3, f"Expected 3 results, got {len(results)}" - - for result in results: - assert hasattr(result, "success"), "Result missing 'success' attribute" - - -class TestConnectionTesting: - """Test connection testing examples from README.""" - - @pytest.mark.asyncio - async def test_async_connection_test(self, async_client): - """ - Test: README - Async connection test. - Line: 510-511 - """ - # From README: - # is_valid = await client.test_connection() - - is_valid = await async_client.test_connection() - - assert isinstance(is_valid, bool), "test_connection should return bool" - assert is_valid is True, "Connection test should succeed" - - def test_sync_connection_test(self, client): - """ - Test: README - Sync connection test. 
- Line: 512 - """ - # From README: - # is_valid = client.test_connection_sync() - - is_valid = client.test_connection_sync() - - assert isinstance(is_valid, bool), "test_connection_sync should return bool" - assert is_valid is True, "Sync connection test should succeed" - - @pytest.mark.asyncio - async def test_get_account_info_async(self, async_client): - """ - Test: README - Get account info async. - Line: 514-519 - """ - # From README: - # info = await client.get_account_info() - # print(f"Zones: {info['zone_count']}") - # print(f"Active zones: {[z['name'] for z in info['zones']]}") - - info = await async_client.get_account_info() - - assert isinstance(info, dict), "Account info should be dict" - assert "zone_count" in info, "Account info missing 'zone_count'" - assert "zones" in info, "Account info missing 'zones'" - - def test_get_account_info_sync(self, client): - """ - Test: README - Get account info sync. - Line: 516 - """ - # From README: - # info = client.get_account_info_sync() - - info = client.get_account_info_sync() - - assert isinstance(info, dict), "Account info should be dict" - assert "zone_count" in info, "Account info missing 'zone_count'" - assert "zones" in info, "Account info missing 'zones'" - - -class TestResultObjects: - """Test result object examples from README.""" - - def test_result_object_attributes(self, client): - """ - Test: README - Result object attributes and methods. 
- Line: 577-595 - """ - # From README: - # result = client.scrape.amazon.products(url="...") - # result.success, result.data, result.error, result.cost - # result.platform, result.method - # result.elapsed_ms(), result.get_timing_breakdown() - # result.to_dict(), result.to_json(indent=2) - - result = client.scrape_url("https://example.com") - - # Verify all attributes - assert hasattr(result, "success"), "Missing 'success' attribute" - assert hasattr(result, "data"), "Missing 'data' attribute" - assert hasattr(result, "error"), "Missing 'error' attribute" - assert hasattr(result, "cost"), "Missing 'cost' attribute" - assert hasattr(result, "platform"), "Missing 'platform' attribute" - assert hasattr(result, "method"), "Missing 'method' attribute" - - # Verify methods - elapsed = result.elapsed_ms() - assert isinstance(elapsed, (int, float)), "elapsed_ms() should return number" - - timing = result.get_timing_breakdown() - assert isinstance(timing, dict), "get_timing_breakdown() should return dict" - - result_dict = result.to_dict() - assert isinstance(result_dict, dict), "to_dict() should return dict" - - result_json = result.to_json(indent=2) - assert isinstance(result_json, str), "to_json() should return str" - json.loads(result_json) # Verify valid JSON - - -class TestAdvancedUsage: - """Test advanced usage examples from README.""" - - @pytest.mark.slow - def test_sync_method_usage(self, client): - """ - Test: README - Sync method usage. - Line: 826-830 - """ - # From README: - # result = client.scrape.linkedin.profiles( - # url="https://linkedin.com/in/johndoe", - # timeout=300 - # ) - - result = client.scrape.linkedin.profiles( - url="https://linkedin.com/in/williamhgates", timeout=300 - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - @pytest.mark.slow - @pytest.mark.asyncio - async def test_async_method_usage(self, api_token): - """ - Test: README - Async method usage. 
- Line: 832-843 - """ - # From README: - # async def scrape_profiles(): - # async with BrightDataClient() as client: - # result = await client.scrape.linkedin.profiles( - # url="https://linkedin.com/in/johndoe", - # timeout=300 - # ) - - async with BrightDataClient(token=api_token) as client: - result = await client.scrape.linkedin.profiles( - url="https://linkedin.com/in/williamhgates", timeout=300 - ) - - assert result is not None, "Result is None" - assert hasattr(result, "success"), "Result missing 'success' attribute" - - -class TestCompleteWorkflow: - """Test the complete workflow example from README.""" - - @pytest.mark.slow - def test_complete_workflow_example(self, api_token): - """ - Test: README - Complete Workflow Example. - Line: 1094-1159 - """ - # From README: - # client = BrightDataClient() - # if client.test_connection_sync(): - # info = client.get_account_info_sync() - # product = client.scrape.amazon.products(...) - # jobs = client.search.linkedin.jobs(...) - # search_results = client.search.google(...) 
- - client = BrightDataClient(token=api_token) - - # Test connection - is_connected = client.test_connection_sync() - assert is_connected is True, "Connection test failed" - - # Get account info - info = client.get_account_info_sync() - assert isinstance(info, dict), "Account info should be dict" - assert "zone_count" in info, "Account info missing 'zone_count'" - - # Scrape Amazon product - product = client.scrape.amazon.products(url="https://amazon.com/dp/B0CRMZHDG8") - assert product is not None, "Amazon product result is None" - assert hasattr(product, "success"), "Product result missing 'success'" - - # Search LinkedIn jobs - jobs = client.search.linkedin.jobs( - keyword="python developer", location="San Francisco", remote=True - ) - assert jobs is not None, "LinkedIn jobs result is None" - assert hasattr(jobs, "success"), "Jobs result missing 'success'" - - # Search Google - search_results = client.search.google( - query="python async tutorial", location="United States", num_results=5 - ) - assert search_results is not None, "Google search result is None" - assert hasattr(search_results, "success"), "Search result missing 'success'" - - -class TestCLIExamples: - """Test CLI usage examples from README.""" - - def test_cli_help_command(self): - """ - Test: README - CLI help command. - Line: 606 - """ - # From README: - # brightdata --help - - result = subprocess.run( - ["brightdata", "--help"], capture_output=True, text=True, timeout=10 - ) - - assert result.returncode == 0, f"CLI help command failed with code {result.returncode}" - assert ( - "brightdata" in result.stdout.lower() or "help" in result.stdout.lower() - ), "Help output should contain expected text" - - @pytest.mark.slow - def test_cli_scrape_amazon_products(self, api_token): - """ - Test: README - CLI scrape Amazon product command. 
- Line: 608-611 - """ - # From README: - # brightdata scrape amazon products \ - # "https://amazon.com/dp/B0CRMZHDG8" - - env = os.environ.copy() - env["BRIGHTDATA_API_TOKEN"] = api_token - - result = subprocess.run( - ["brightdata", "scrape", "amazon", "products", "https://amazon.com/dp/B0CRMZHDG8"], - capture_output=True, - text=True, - timeout=120, - env=env, - ) - - # CLI should execute without error (exit code 0 or 1) - assert result.returncode in [ - 0, - 1, - ], f"CLI command failed with unexpected code {result.returncode}: {result.stderr}" - - @pytest.mark.slow - def test_cli_search_linkedin_jobs(self, api_token): - """ - Test: README - CLI search LinkedIn jobs command. - Line: 613-618 - """ - # From README: - # brightdata search linkedin jobs \ - # --keyword "python developer" \ - # --location "New York" \ - # --remote \ - # --output-file jobs.json - - env = os.environ.copy() - env["BRIGHTDATA_API_TOKEN"] = api_token - - result = subprocess.run( - [ - "brightdata", - "search", - "linkedin", - "jobs", - "--keyword", - "python developer", - "--location", - "New York", - "--remote", - ], - capture_output=True, - text=True, - timeout=120, - env=env, - ) - - # CLI should execute without error - assert result.returncode in [ - 0, - 1, - ], f"CLI command failed with unexpected code {result.returncode}: {result.stderr}" - - def test_cli_search_google(self, api_token): - """ - Test: README - CLI search Google command. 
- Line: 620-623 - """ - # From README: - # brightdata search google \ - # "python tutorial" \ - # --location "United States" - - env = os.environ.copy() - env["BRIGHTDATA_API_TOKEN"] = api_token - - result = subprocess.run( - ["brightdata", "search", "google", "python tutorial", "--location", "United States"], - capture_output=True, - text=True, - timeout=60, - env=env, - ) - - # CLI should execute without error - assert result.returncode in [ - 0, - 1, - ], f"CLI command failed with unexpected code {result.returncode}: {result.stderr}" - - def test_cli_scrape_generic(self, api_token): - """ - Test: README - CLI generic web scraping command. - Line: 625-628 - """ - # From README: - # brightdata scrape generic \ - # "https://example.com" \ - # --response-format pretty - - env = os.environ.copy() - env["BRIGHTDATA_API_TOKEN"] = api_token - - result = subprocess.run( - [ - "brightdata", - "scrape", - "generic", - "https://example.com", - "--response-format", - "pretty", - ], - capture_output=True, - text=True, - timeout=60, - env=env, - ) - - # CLI should execute without error - assert result.returncode in [ - 0, - 1, - ], f"CLI command failed with unexpected code {result.returncode}: {result.stderr}" - - -if __name__ == "__main__": - """Run tests with pytest.""" - pytest.main([__file__, "-v", "--tb=short"]) diff --git a/tests/run_all.py b/tests/run_all.py deleted file mode 100644 index 789f7e9..0000000 --- a/tests/run_all.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python3 -""" -Comprehensive test runner - validates EVERYTHING -Saves all outputs to probe/ directory for inspection -""" - -import subprocess -import json -from pathlib import Path -from datetime import datetime - -# Create probe directory structure matching tests/ structure -PROBE_DIR = Path("probe") -PROBE_DIR.mkdir(exist_ok=True) -(PROBE_DIR / "unit").mkdir(exist_ok=True) -(PROBE_DIR / "e2e").mkdir(exist_ok=True) -(PROBE_DIR / "integration").mkdir(exist_ok=True) -(PROBE_DIR / 
"enes").mkdir(exist_ok=True) -(PROBE_DIR / "root").mkdir(exist_ok=True) - -# Test suites to run (matches tests/ directory structure) -test_suites = { - "root_readme": "tests/readme.py", # Root level test - "unit": "tests/unit/", - "e2e": "tests/e2e/", - "integration": "tests/integration/", - "enes": "tests/enes/", -} - -# Linting checks -lint_checks = { - "black": ["black", "--check", "src", "tests"], - "ruff": ["ruff", "check", "src/", "tests/"], -} - -timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") -results = {"timestamp": timestamp, "test_suites": {}, "lint_checks": {}, "summary": {}} - -print("=" * 80) -print("COMPREHENSIVE SDK VALIDATION") -print("=" * 80) -print(f"Timestamp: {timestamp}") -print(f"Output directory: {PROBE_DIR.absolute()}") -print("=" * 80) - -# Run linting checks -print("\n📋 STEP 1: LINTING CHECKS") -print("-" * 80) - -for check_name, command in lint_checks.items(): - print(f"\n{check_name.upper()}:") - result = subprocess.run(command, capture_output=True, text=True, timeout=60) - - output_file = PROBE_DIR / f"{check_name}_{timestamp}.txt" - output_file.write_text(result.stdout + "\n\n" + result.stderr) - - passed = result.returncode == 0 - results["lint_checks"][check_name] = { - "passed": passed, - "output_file": str(output_file), - "return_code": result.returncode, - } - - if passed: - print(" ✅ PASSED") - else: - print(f" ❌ FAILED (exit code {result.returncode})") - print(f" 📁 Output saved to: {output_file.name}") - -# Run test suites -print("\n📋 STEP 2: TEST SUITES") -print("-" * 80) - -total_passed = 0 -total_failed = 0 - -for suite_name, test_path in test_suites.items(): - print(f"\n{suite_name.upper()} TESTS:") - - result = subprocess.run( - ["python", "-m", "pytest", test_path, "-v", "--tb=short"], - capture_output=True, - text=True, - timeout=300, # Increased timeout for readme tests - ) - - # Save to proper subdirectory - if suite_name == "root_readme": - output_file = PROBE_DIR / "root" / f"readme_{timestamp}.txt" - else: - 
output_file = PROBE_DIR / suite_name / f"all_{timestamp}.txt" - - output_file.write_text(result.stdout + "\n\n" + result.stderr) - - # Parse results - output = result.stdout + result.stderr - - # Extract pass/fail counts - import re - - match = re.search(r"(\d+) passed", output) - passed = int(match.group(1)) if match else 0 - - match = re.search(r"(\d+) failed", output) - failed = int(match.group(1)) if match else 0 - - match = re.search(r"(\d+) skipped", output) - skipped = int(match.group(1)) if match else 0 - - total_passed += passed - total_failed += failed - - results["test_suites"][suite_name] = { - "passed": passed, - "failed": failed, - "skipped": skipped, - "output_file": str(output_file), - "return_code": result.returncode, - } - - status = "✅ PASSED" if failed == 0 else f"❌ {failed} FAILED" - print(f" {status} - {passed} passed, {failed} failed, {skipped} skipped") - print(f" 📁 Output saved to: {output_file.relative_to(Path.cwd())}") - - # Also run individual test files for detailed inspection - if suite_name in ["unit", "e2e", "integration"]: - test_files = Path(test_path).glob("test_*.py") - for test_file in test_files: - individual_result = subprocess.run( - ["python", "-m", "pytest", str(test_file), "-v", "--tb=short"], - capture_output=True, - text=True, - timeout=60, - ) - - # Save individual test outputs - individual_output = PROBE_DIR / suite_name / f"{test_file.stem}_{timestamp}.txt" - individual_output.write_text( - individual_result.stdout + "\n\n" + individual_result.stderr - ) - -# Save summary -summary_file = PROBE_DIR / f"summary_{timestamp}.json" -results["summary"] = { - "total_tests_passed": total_passed, - "total_tests_failed": total_failed, - "all_linting_passed": all(v["passed"] for v in results["lint_checks"].values()), - "all_tests_passed": total_failed == 0, - "overall_status": ( - "PASS" - if (total_failed == 0 and all(v["passed"] for v in results["lint_checks"].values())) - else "FAIL" - ), -} - 
-summary_file.write_text(json.dumps(results, indent=2)) - -# Final summary -print("\n" + "=" * 80) -print("FINAL VALIDATION SUMMARY") -print("=" * 80) - -print("\n📊 TEST RESULTS:") -for suite, data in results["test_suites"].items(): - print(f" {suite:15} {data['passed']:4} passed, {data['failed']:4} failed") - -print(f"\n TOTAL: {total_passed:4} passed, {total_failed:4} failed") - -print("\n🔍 LINTING:") -for check, data in results["lint_checks"].items(): - status = "✅ PASS" if data["passed"] else "❌ FAIL" - print(f" {check:15} {status}") - -print(f"\n📁 All outputs saved to: {PROBE_DIR.absolute()}") -print(f"📄 Summary: {summary_file.name}") - -print("\n" + "=" * 80) -if results["summary"]["overall_status"] == "PASS": - print("🎉 ALL VALIDATIONS PASSED - SDK IS 100% WORKING") -else: - print("⚠️ SOME VALIDATIONS FAILED - CHECK PROBE OUTPUTS") -print("=" * 80) - -# Exit with appropriate code -exit(0 if results["summary"]["overall_status"] == "PASS" else 1) diff --git a/tests/samples/amazon/product.json b/tests/samples/amazon/product.json deleted file mode 100644 index 7ca848c..0000000 --- a/tests/samples/amazon/product.json +++ /dev/null @@ -1,648 +0,0 @@ -{ - "title": "STANLEY Quencher H2.0 Tumbler with Handle and Straw 40 oz | Flowstate 3-Position Lid | Cup Holder Compatible for Travel | Insulated Stainless Steel Cup | BPA-Free | Fuchsia", - "seller_name": "FastTrackShop", - "brand": "Stanley 1913", - "description": "Constructed of recycled stainless steel for sustainable sipping, our 40 oz Quencher H2.0 offers maximum hydration with fewer refills. Commuting, studio workouts, day trips or your front porch\u2014you\u2019ll want this tumbler by your side. Thanks to Stanley\u2019s vacuum insulation, your water will stay ice-cold, hour after hour. The advanced FlowState\u2122 lid features a rotating cover with three positions: a straw opening designed to resist splashes while holding the reusable straw in place, a drink opening, and a full-cover top. 
The ergonomic handle includes comfort-grip inserts for easy carrying, and the narrow base fits just about any car cup holder.", - "initial_price": 44.52, - "currency": "USD", - "availability": "Only 19 left in stock - order soon.", - "reviews_count": 2078, - "categories": [ - "Home & Kitchen", - "Kitchen & Dining", - "Storage & Organization", - "Thermoses", - "Insulated Beverage Containers", - "Tumblers" - ], - "parent_asin": "B0CRMZHDG8", - "asin": "B0CRMZHDG8", - "buybox_seller": "FastTrackShop", - "number_of_sellers": 1, - "root_bs_rank": 12403, - "answered_questions": 0, - "domain": "https://www.amazon.com/", - "images_count": 9, - "url": "https://www.amazon.com/STANLEY-Flowstate-3-Position-Compatible-Insulated/dp/B0CRMZHDG8?th=1&psc=1&language=en_US¤cy=USD", - "video_count": 6, - "image_url": "https://m.media-amazon.com/images/I/61Q4eGZWFSL._AC_SL1500_.jpg", - "item_weight": "1.4 Pounds", - "rating": 4.7, - "product_dimensions": "10\"W x 13.25\"H", - "seller_id": "A62ZX0SLNJGAO", - "image": "https://m.media-amazon.com/images/I/61Q4eGZWFSL._AC_SL1500_.jpg", - "date_first_available": "March 11, 2025", - "model_number": "Stanley Quencher H2.O FlowState\u2122 Tumbler 40 oz Fuchsia", - "manufacturer": "Stanley", - "department": "Home & Kitchen", - "plus_content": true, - "upc": "041604394331", - "video": true, - "top_review": "I love this cup!!! It keeps my drinks cold for so long! The next day it will still have ice in it. It makes me drink more water as well. I love the color! It is my favorite cup.", - "final_price_high": null, - "final_price": 44.52, - "delivery": [ - "FREE delivery Tuesday, November 25", - "Or get FREE delivery . Order within ." - ], - "features": [ - "YOUR DREAM TUMBLER Whichever way your day flows, the H2.0 FlowState tumbler keeps you refreshed with fewer refills. Double wall vacuum insulation means drinks stay cold, iced or hot for hours. Choose between our 14oz, 20oz, 30oz,40oz and 64oz options depending on your hydration needs. 
The narrow base on all sizes (except 64oz) fits just about any car cup holder, keeping it right by your side.", - "ADVANCED LID CONSTRUCTION Whether you prefer small sips or maximum thirst quenching, Stanley has developed an advanced FlowState lid, featuring a rotating cover with three positions a straw opening designed to resist splashes with a seal that holds the reusable straw in place, a drink opening, and a full-cover top for added leak resistance. We\u2019ve also included an ergonomic, comfort-grip handle, so you can easily carry your ice-cold water to work, meetings, the gym or trips out of town.", - "EARTH-FRIENDLY DURABILITY Constructed of 90% recycled BPA free stainless steel for sustainable sipping, the Stanley Quencher H2.0 has the durability to stand up to a lifetime of use. Eliminate the use of single-use plastic bottles and straws with a travel tumbler built with sustainability in mind.", - "DISHWASHER SAFE Spend less time hunched over the sink and more time doing the things you love. Cleaning your tumbler and lid couldn't be easier, just pop them into the dishwasher. Unlike plastic bottles that retain stains & smells, this metallic beauty comes out pristine", - "LIFETIME WARRANTY Since 1913 we\u2019ve promised to provide rugged, capable gear for food and drink - accessories built to last a lifetime. It\u2019s a promise we still keep. Stanley products purchased from Stanley Resellers come with a lifetime warranty. Rest easy knowing we\u2019ve got your back through it all." 
- ], - "buybox_prices": { - "final_price": 44.52, - "unit_price": null - }, - "bought_past_month": 300, - "is_available": true, - "root_bs_category": "Kitchen & Dining", - "bs_category": "Insulated Tumblers", - "bs_rank": 106, - "badge": null, - "subcategory_rank": [ - { - "subcategory_name": "Insulated Tumblers", - "subcategory_rank": 106 - } - ], - "amazon_choice": false, - "images": [ - "https://m.media-amazon.com/images/I/61Q4eGZWFSL._AC_SL1500_.jpg", - "https://m.media-amazon.com/images/I/51bcm0wT+ML._AC_SL1500_.jpg", - "https://m.media-amazon.com/images/I/419lkeRtRxL._AC_SL1500_.jpg", - "https://m.media-amazon.com/images/I/713M33yoSlL._AC_SL1500_.jpg", - "https://m.media-amazon.com/images/I/71I9Aj+yxzL._AC_SL1500_.jpg", - "https://m.media-amazon.com/images/I/51YC0JfYF+L._AC_SL1500_.jpg", - "https://m.media-amazon.com/images/I/614gBtVIEuL._AC_SL1500_.jpg", - "https://m.media-amazon.com/images/I/61B+Xczl9dL._AC_SL1500_.jpg", - "https://m.media-amazon.com/images/I/81-s3dUib0L._AC_SL1500_.jpg" - ], - "product_details": [ - { - "type": "Brand", - "value": "Stanley 1913" - }, - { - "type": "Color", - "value": "Fuchsia" - }, - { - "type": "Special Feature", - "value": "Rotating" - }, - { - "type": "Style", - "value": "40 oz" - }, - { - "type": "Theme", - "value": "Floral" - }, - { - "type": "Recommended Uses For Product", - "value": "Travel" - }, - { - "type": "Included Components", - "value": "Lid, Straw" - }, - { - "type": "Shape", - "value": "Round" - }, - { - "type": "Pattern", - "value": "Solid" - }, - { - "type": "Product Care Instructions", - "value": "Hand Wash Only" - }, - { - "type": "Age Range (Description)", - "value": "Adult" - }, - { - "type": "Material Feature", - "value": "Insulated" - }, - { - "type": "Reusability", - "value": "Reusable" - }, - { - "type": "Unit Count", - "value": "1.0 Count" - }, - { - "type": "Item Weight", - "value": "1.4 Pounds" - }, - { - "type": "Product Dimensions", - "value": "10\"W x 13.25\"H" - }, - { - "type": "Number of 
Items", - "value": "1" - }, - { - "type": "Pattern", - "value": "Solid" - }, - { - "type": "Manufacturer", - "value": "Stanley" - }, - { - "type": "UPC", - "value": "041604394331" - }, - { - "type": "Size", - "value": "40 Ounces" - }, - { - "type": "Item Package Dimensions L x W x H", - "value": "11.18 x 10.28 x 6.54 inches" - }, - { - "type": "Package Weight", - "value": "0.66 Kilograms" - }, - { - "type": "Item DimensionsLxWxH", - "value": "10 x 10 x 13.25 inches" - }, - { - "type": "Brand Name", - "value": "Stanley 1913" - }, - { - "type": "Warranty Description", - "value": "Lifetime Warranty" - }, - { - "type": "Model Name", - "value": "Stanley Quencher H2.O FlowState\u2122 Tumbler 40 oz Fuchsia" - }, - { - "type": "Suggested Users", - "value": "Unisex-Adult" - }, - { - "type": "Part Number", - "value": "10-11824-062" - }, - { - "type": "ASIN", - "value": "B0CRMZHDG8" - }, - { - "type": "Customer Reviews", - "value": "4.74.7 out of 5 stars2,078 ratings4.7 out of 5 stars" - }, - { - "type": "Best Sellers Rank", - "value": "#12,403 in Kitchen & Dining (See Top 100 in Kitchen & Dining)#106 in Insulated Tumblers" - }, - { - "type": "Date First Available", - "value": "March 11, 2025" - }, - { - "type": "Brand", - "value": "Stanley 1913" - }, - { - "type": "Color", - "value": "Fuchsia" - }, - { - "type": "Special Feature", - "value": "Rotating" - }, - { - "type": "Style", - "value": "40 oz" - }, - { - "type": "Theme", - "value": "Floral" - } - ], - "prices_breakdown": null, - "country_of_origin": null, - "from_the_brand": [ - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/289ef6ac-53e0-442e-8dcd-aef53675a4e3.__CR0,0,2928,1250_PT0_SX1464_V1___.png", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/744f0625-ad5f-41f8-b924-a79ad364f4eb.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/65a8b1bf-3d86-4a62-8362-781f4f2f86e0.__AC_SR166,182___.jpg", - 
"https://m.media-amazon.com/images/S/aplus-media-library-service-media/a06812ed-49d8-4806-b289-a0dd02d88644.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/a3710e6d-8c2f-42dd-8bef-8d8c3ed6ff16.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/a0ced0b4-02ee-40c9-aa22-c1c331bf79d8.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/c91b67b2-821a-47a1-a37b-975a17cc97fa.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/d971e158-0d75-45af-8e5c-d224d6011ba5.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/913f6e8d-4a2d-4a4c-b66e-17db0bdc6077.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/cec8cea2-4c57-4292-b1f1-bcae6c502169.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/e561aed0-e189-4bfd-b221-6df73df8a2c8.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/65b19950-5aa0-471f-b549-ef1f1cbad0eb.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/733259ef-a6b0-43cc-8831-306369db10c9.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/9386605f-680d-4eee-894a-c567f6fea5bc.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/166c5288-1843-432d-94f4-5a43675c65d7.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/ef781bc7-e109-40e7-8016-1fde7ff02976.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/88613228-dfd6-4b8d-be41-cc1e5b955567.__AC_SR166,182___.jpg", - 
"https://m.media-amazon.com/images/S/aplus-media-library-service-media/623d0286-4f35-43d4-a315-0614be16aa33.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/3c222f6e-9b38-45c9-bcd5-3ec7870bc433.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/775e4c54-25bb-4120-a48e-16c19dce98e4.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/699a1ab4-7ef0-492f-8c23-97734a455433.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/f79ddce6-cdb6-4a92-adf7-d407d1a93626.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/71485eb1-479b-4124-98f8-eca2dc6ff807.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/9ea36aee-e871-4e92-bcc3-76e6df248051.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/a4bdf43f-edfc-4a12-a0ad-db7b5b429735.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/17d96896-3b9a-482c-9695-76f8cab874a2.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/520f6044-2a05-401d-bd43-6d00946e54fc.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/db320bf5-4cd1-4814-bea0-13d1f6783220.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/8b59016a-3915-4bcc-b607-c82b296517cf.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/86915546-8541-4832-a5b4-1e63e5b6e2c5.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/e78a6603-4100-4ce6-97cd-d2ef1c5acc67.__AC_SR166,182___.jpg", - 
"https://m.media-amazon.com/images/S/aplus-media-library-service-media/cbba4633-227b-4bcf-b18c-daa2d8b3c6dc.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/c86cbbb2-b589-46e9-85b3-cbc690b232f7.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/9cec6a8f-2302-4952-876d-12c28b100ae5.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/b09a5a69-15f3-401f-b3e5-d3b5b6238baf.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/b58fcbd7-651a-49dd-a5e5-aee0f32b7dec.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/39ce060b-b87a-40e5-b14f-6627aa9bd714.__AC_SR166,182___.jpg", - "https://m.media-amazon.com/images/S/aplus-media-library-service-media/7e253e3c-300c-4d36-ba7e-fee31684789a.__CR0,0,724,906_PT0_SX362_V1___.png" - ], - "product_description": [ - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/97534d05-65a9-43c0-a6ec-ab09231b1ea5.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/73093095-b4bd-46ac-af17-ee3486cdb1ff.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/d91ba5fc-0876-40cb-9d31-aea31266a043.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/5ba07b54-05f9-4089-9328-587c2170a4c5.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/8247baba-436e-4b64-9e47-61c278440a48.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": 
"https://m.media-amazon.com/images/S/aplus-media-library-service-media/373de93e-8def-45ec-bf00-2370969f113e.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/afa3eb5d-7457-4e00-9b9a-426e1e00fea9.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/406cb6fd-9aee-49fe-a79a-1b2aa8700884.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/0bb29a89-bc89-4fab-878c-84382958ae15.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/af434d84-68b7-4e1b-a06f-8b535a4ac3f4.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/53b837d7-b6be-4cc0-9573-7c75722cfe9e.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/f6dfa7dd-91d4-4e1e-80d9-2a8b78daf0db.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/df69257d-1acd-4ef8-af33-004d0d8258fe.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/e593ca03-b2e0-4a03-b74a-1646aa4c361b.__CR0,0,2928,1200_PT0_SX1464_V1___.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/e5830cfa-0097-46bd-9a3e-393cb263e28f.__CR212,74,1319,1484_PT0_SX200_V1___.jpg", - "type": "image" - }, - { - "url": 
"https://m.media-amazon.com/images/S/aplus-media-library-service-media/e02e448e-65b9-449f-b78d-e7cd7cfeb0be.__CR126,79,1399,1574_PT0_SX200_V1___.jpg", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/86d06781-86ce-437c-ac86-39ae51b0ede2.__CR146,60,1313,1477_PT0_SX200_V1___.jpg", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/fca8f770-0b3b-49c5-a86d-4609bbcc29bd.__CR82,78,1379,1552_PT0_SX200_V1___.jpg", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/6a16ad22-42e0-4d22-938b-119193ee2638.__CR156,78,1390,1564_PT0_SX200_V1___.jpg", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/59b0fa15-b485-40f4-888a-0d352fed7645.__CR72,0,1413,1590_PT0_SX200_V1___.jpg", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/aplus-media-library-service-media/e52389cd-649a-43df-a8a1-d99d8625a1b4.__CR0,32,1363,1533_PT0_SX200_V1___.jpg", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/I/91S78bzAXzL.png", - "type": "image" - }, - { - "url": "https://m.media-amazon.com/images/S/vse-vms-transcoding-artifact-us-east-1-prod/846952ac-5bb3-4aee-b2e2-4e5c6f0260fd/default.jobtemplate.hls.m3u8", - "type": "video" - } - ], - "seller_url": "https://www.amazon.com/sp?ie=UTF8&seller=A62ZX0SLNJGAO&asin=B0CRMZHDG8", - "customer_says": "Customers praise the tumbler's ability to keep drinks cold all day and maintain temperature consistency, while also appreciating its high-quality construction, vibrant colors, and ice retention that keeps ice for long periods. The cup is durable, with one customer noting it lasts over 24 hours, and customers find it visually appealing and worth the price. 
The leak-proof feature receives mixed reviews, with some customers reporting it can leak.", - "sustainability_features": null, - "climate_pledge_friendly": false, - "videos": [ - "https://www.amazon.com/vdp/00e6bdd168764c04b4c944ca2303813e", - "https://www.amazon.com/vdp/02ea57c0d6674df78fad4e80312af24b", - "https://www.amazon.com/vdp/0c7aebe9726643698e381e08bceef10c", - "https://www.amazon.com/vdp/06ad9e7830634634ada87eeceafcf9ec", - "https://www.amazon.com/vdp/043823788964478e8c4f2f302cbd5ded", - "https://www.amazon.com/vdp/05aa96f7466242dd93615fd06af24de0" - ], - "other_sellers_prices": [ - { - "price": 44.52, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Or Prime members get FREE delivery Sunday, November 23. Order within 7 hrs 42 mins. Join Prime", - "seller_name": "FastTrackShop", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A62ZX0SLNJGAO&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_0", - "seller_rating": 4.5, - "ships_from": "Amazon.com", - "num_of_ratings": 2078 - }, - { - "price": 36.99, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery November 25 - 28. Order within 57 mins. Details", - "seller_name": "Delivering Delight", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A7MWBES2H1S9W&isAmazonFulfilled=0&asin=B0CRMZHDG8&ref_=olp_merch_name_1", - "seller_rating": 3.5, - "ships_from": "Delivering Delight", - "num_of_ratings": 666 - }, - { - "price": 43.54, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Or Prime members get FREE delivery Saturday, November 22. Order within 6 hrs 17 mins. 
Join Prime", - "seller_name": "Amazon Resale", - "seller_url": "https://www.amazon.com/Warehouse-Deals/b?ie=UTF8&node=10158976011", - "seller_rating": 0, - "ships_from": "Amazon.com", - "num_of_ratings": 0 - }, - { - "price": 44.52, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery November 29 - December 3. Or fastest delivery November 29 - 30", - "seller_name": "Boddigan", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A37H64HUL33DH6&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_3", - "seller_rating": 5, - "ships_from": "Amazon.com", - "num_of_ratings": 423 - }, - { - "price": 44.52, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Order within 11 hrs 57 mins", - "seller_name": "Instant Outfitters", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A2THUWHK9D7AMP&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_4", - "seller_rating": 3.5, - "ships_from": "Amazon.com", - "num_of_ratings": 167 - }, - { - "price": 44.74, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Or Prime members get FREE delivery Saturday, November 22. Order within 11 hrs 57 mins. Join Prime", - "seller_name": "Premier Shipping Fast", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=AB1XQ3DA8GGTV&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_5", - "seller_rating": 5, - "ships_from": "Amazon.com", - "num_of_ratings": 15978 - }, - { - "price": 45, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery December 3 - 10. 
Or fastest delivery December 3 - 7", - "seller_name": "WW Distribution", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A4XGQVD7S67VA&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_6", - "seller_rating": 5, - "ships_from": "Amazon.com", - "num_of_ratings": 393 - }, - { - "price": 45.2, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Or Prime members get FREE delivery Sunday, November 23. Order within 6 hrs 7 mins. Join Prime", - "seller_name": "Broheemium", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A9FTCNW4UYFKQ&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_7", - "seller_rating": 5, - "ships_from": "Amazon.com", - "num_of_ratings": 181 - }, - { - "price": 49.99, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Or Prime members get FREE delivery Sunday, November 23. Order within 8 hrs 32 mins. Join Prime", - "seller_name": "Fill_Your_Cart28", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A1RCXB1QVB73AE&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_8", - "seller_rating": 4.5, - "ships_from": "Amazon.com", - "num_of_ratings": 26 - }, - { - "price": 49.99, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Saturday, November 29. Details. Or fastest delivery Tomorrow, November 21. Order within 57 mins. Details", - "seller_name": "TNC Express", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A3RTU2Q9OU5P7P&isAmazonFulfilled=0&asin=B0CRMZHDG8&ref_=olp_merch_name_9", - "seller_rating": 4.5, - "ships_from": "TNC Express", - "num_of_ratings": 48 - }, - { - "price": 52.5, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery November 28 - December 4. 
Or fastest delivery November 28 - 30", - "seller_name": "Precision Distributions", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A2X4RNMCZVNFMR&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_10", - "seller_rating": 5, - "ships_from": "Amazon.com", - "num_of_ratings": 92 - }, - { - "price": 52.5, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery November 28 - December 3. Or fastest delivery November 28 - 29", - "seller_name": "Dazzling Deals L.L.C", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A3STZM2JKZANQS&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_11", - "seller_rating": 4.5, - "ships_from": "Amazon.com", - "num_of_ratings": 181 - }, - { - "price": 52.52, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Or Prime members get FREE delivery Sunday, November 23. Order within 8 hrs 27 mins. Join Prime", - "seller_name": "Tonya's Store 87", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A8PEXSLRSCNI2&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_12", - "seller_rating": 5, - "ships_from": "Amazon.com", - "num_of_ratings": 1542 - }, - { - "price": 52.79, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery November 29 - December 2. Or fastest delivery November 29 - 30", - "seller_name": "Voadera", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A3E9SVC4GWPUCM&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_13", - "seller_rating": 5, - "ships_from": "Amazon.com", - "num_of_ratings": 16603 - }, - { - "price": 53.99, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Wednesday, November 26. 
Or fastest delivery Monday, November 24", - "seller_name": "TophersTreasures", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A3616504VS9MRR&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_14", - "seller_rating": 4.5, - "ships_from": "Amazon.com", - "num_of_ratings": 116 - }, - { - "price": 54, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Or Prime members get FREE delivery Saturday, November 22. Order within 4 hrs 42 mins. Join Prime", - "seller_name": "Beacon North", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=ARGTW8RFQ3UFU&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_15", - "seller_rating": 4.5, - "ships_from": "Amazon.com", - "num_of_ratings": 172 - }, - { - "price": 54, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Or Prime members get FREE delivery Saturday, November 22. Order within 4 hrs 42 mins. Join Prime", - "seller_name": "Beacon North", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=ARGTW8RFQ3UFU&isAmazonFulfilled=1&asin=B0CRMZHDG8&ref_=olp_merch_name_16", - "seller_rating": 4.5, - "ships_from": "Amazon.com", - "num_of_ratings": 172 - }, - { - "price": 54.99, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery Tuesday, November 25. Order within 57 mins. Details", - "seller_name": "DesignsbyAng", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A1H4WHLNQXFU29&isAmazonFulfilled=0&asin=B0CRMZHDG8&ref_=olp_merch_name_17", - "seller_rating": 3.5, - "ships_from": "DesignsbyAng", - "num_of_ratings": 138 - }, - { - "price": 60, - "price_per_unit": null, - "unit": null, - "delivery": "FREE delivery November 25 - 28. Details. Or fastest delivery Monday, November 24. Order within 57 mins. 
Details", - "seller_name": "FireandBrimstone1010", - "seller_url": "https://www.amazon.com/gp/aag/main?ie=UTF8&seller=A2PUD1PWGNXXSG&isAmazonFulfilled=0&asin=B0CRMZHDG8&ref_=olp_merch_name_18", - "seller_rating": 4.5, - "ships_from": "FireandBrimstone1010", - "num_of_ratings": 6 - } - ], - "downloadable_videos": [ - "https://m.media-amazon.com/images/S/vse-vms-transcoding-artifact-us-east-1-prod/b2b4a7be-27dc-4a86-8eb0-1c8229b86302/default.jobtemplate.hls.m3u8", - "https://m.media-amazon.com/images/S/vse-vms-transcoding-artifact-us-east-1-prod/383bd4c7-9bfb-4d91-838e-368f889abf89/default.jobtemplate.hls.m3u8", - "https://m.media-amazon.com/images/S/vse-vms-transcoding-artifact-us-east-1-prod/0d94e621-db7f-49d0-8462-7e52316c7e4c/default.jobtemplate.hls.m3u8", - "https://m.media-amazon.com/images/S/vse-vms-transcoding-artifact-us-east-1-prod/87d91748-528c-49fd-903e-394eb4f75a2e/default.jobtemplate.hls.m3u8", - "https://m.media-amazon.com/images/S/vse-vms-transcoding-artifact-us-east-1-prod/a853ef97-4c6c-43a5-917e-f8a85d819ef6/default.jobtemplate.hls.m3u8", - "https://m.media-amazon.com/images/S/vse-vms-transcoding-artifact-us-east-1-prod/9f6fc459-e9e7-4583-a4a8-9c5db0cfb12d/default.jobtemplate.hls.m3u8" - ], - "editorial_reviews": null, - "about_the_author": null, - "zipcode": "11001", - "coupon": null, - "sponsered": true, - "store_url": "https://www.amazon.com/stores/Stanley/page/47A7E765-00AF-4F34-AC01-240A7EDD822A?lp_asin=B0CRMZHDG8&ref_=ast_bln", - "ships_from": "Amazon", - "city": null, - "customers_say": { - "text": "Customers praise the tumbler's ability to keep drinks cold all day and maintain temperature consistency, while also appreciating its high-quality construction, vibrant colors, and ice retention that keeps ice for long periods. The cup is durable, with one customer noting it lasts over 24 hours, and customers find it visually appealing and worth the price. 
The leak-proof feature receives mixed reviews, with some customers reporting it can leak.", - "keywords": { - "positive": [ - "Drink coldness", - "Quality", - "Color", - "Ice retention", - "Value for money", - "Durability", - "Looks" - ], - "negative": null, - "mixed": [ - "Leak proof" - ] - } - }, - "max_quantity_available": 10, - "variations_values": null, - "language": null, - "return_policy": "FREE refund/replacement until Jan 31, 2026", - "inactive_buy_box": null, - "buybox_seller_rating": null, - "premium_brand": false, - "amazon_prime": true, - "coupon_description": null, - "all_badges": null, - "sponsored": true, - "timestamp": "2025-11-20T17:02:52.814Z", - "input": { - "url": "https://www.amazon.com/dp/B0CRMZHDG8", - "asin": "", - "zipcode": "", - "language": "" - } -} \ No newline at end of file diff --git a/tests/samples/amazon/reviews.json b/tests/samples/amazon/reviews.json deleted file mode 100644 index 9e9b80e..0000000 --- a/tests/samples/amazon/reviews.json +++ /dev/null @@ -1,137 +0,0 @@ -[ - { - "url": "https://www.amazon.com/dp/B0CRMZHDG8", - "product_name": "STANLEY Quencher H2.0 Tumbler with Handle and Straw 40 oz | Flowstate 3-Position Lid | Cup Holder Compatible for Travel | Insulated Stainless Steel Cup | BPA-Free | Fuchsia", - "product_rating": 4.7, - "product_rating_object": { - "one_star": 62, - "two_star": 21, - "three_star": 62, - "four_star": 145, - "five_star": 1787 - }, - "product_rating_max": 5, - "rating": 5, - "author_name": "Lyndsay", - "asin": "B0CRMZHDG8", - "product_rating_count": 2078, - "review_header": "Best cup!", - "review_id": "RCHSV16LEI91Y", - "review_text": "I love this cup!!! It keeps my drinks cold for so long! The next day it will still have ice in it. It makes me drink more water as well. I love the color! 
It is my favorite cup.", - "author_id": "AHMYL6TFPUNKUILNFWPIO2RZJ24A", - "author_link": "https://www.amazon.com/gp/profile/amzn1.account.AHMYL6TFPUNKUILNFWPIO2RZJ24A/ref=cm_cr_dp_d_gw_tr?ie=UTF8", - "badge": "Verified Purchase", - "brand": "Stanley 1913", - "review_posted_date": "October 20, 2025", - "review_country": "United States", - "helpful_count": 0, - "is_amazon_vine": false, - "is_verified": true, - "variant_asin": null, - "variant_name": null, - "videos": null, - "categories": [ - "Home & Kitchen", - "Kitchen & Dining", - "Storage & Organization", - "Thermoses", - "Insulated Beverage Containers", - "Tumblers" - ], - "department": "Home & Kitchen", - "timestamp": "2025-11-20T17:03:56.058Z", - "input": { - "url": "https://www.amazon.com/dp/B0CRMZHDG8" - } - }, - { - "url": "https://www.amazon.com/dp/B0CRMZHDG8", - "product_name": "STANLEY Quencher H2.0 Tumbler with Handle and Straw 40 oz | Flowstate 3-Position Lid | Cup Holder Compatible for Travel | Insulated Stainless Steel Cup | BPA-Free | Fuchsia", - "product_rating": 4.7, - "product_rating_object": { - "one_star": 62, - "two_star": 21, - "three_star": 62, - "four_star": 145, - "five_star": 1787 - }, - "product_rating_max": 5, - "rating": 5, - "author_name": "LGlover1", - "asin": "B0CRMZHDG8", - "product_rating_count": 2078, - "review_header": "Very pretty and functional cup. Love the color.", - "review_id": "RQMKDLY3XTVFJ", - "review_text": "This is the best cup I have ever purchased. The Stanley cups hold the ice for over 24 hours. I love the beautiful pink color and will be purchasing another one. 
The lid stay secure no leaking excellent product.", - "author_id": "AGNEFTTOE3A47UXPJY5GPVAZYDTA", - "author_link": "https://www.amazon.com/gp/profile/amzn1.account.AGNEFTTOE3A47UXPJY5GPVAZYDTA/ref=cm_cr_dp_d_gw_tr?ie=UTF8", - "badge": "Verified Purchase", - "brand": "Stanley 1913", - "review_posted_date": "July 31, 2025", - "review_country": "United States", - "helpful_count": 0, - "is_amazon_vine": false, - "is_verified": true, - "variant_asin": null, - "variant_name": null, - "videos": null, - "categories": [ - "Home & Kitchen", - "Kitchen & Dining", - "Storage & Organization", - "Thermoses", - "Insulated Beverage Containers", - "Tumblers" - ], - "department": "Home & Kitchen", - "timestamp": "2025-11-20T17:03:56.058Z", - "input": { - "url": "https://www.amazon.com/dp/B0CRMZHDG8" - } - }, - { - "url": "https://www.amazon.com/dp/B0CRMZHDG8", - "product_name": "STANLEY Quencher H2.0 Tumbler with Handle and Straw 40 oz | Flowstate 3-Position Lid | Cup Holder Compatible for Travel | Insulated Stainless Steel Cup | BPA-Free | Fuchsia", - "product_rating": 4.7, - "product_rating_object": { - "one_star": 62, - "two_star": 21, - "three_star": 62, - "four_star": 145, - "five_star": 1787 - }, - "product_rating_max": 5, - "rating": 5, - "author_name": "Rook", - "asin": "B0CRMZHDG8", - "product_rating_count": 2078, - "review_header": "Sale Find & Daily Favorite \u2014 Keeps Drinks Cold All Day", - "review_id": "R2CB9FUQVQKI1Z", - "review_text": "I got this Stanley Quencher H2.0 tumbler for my wife, and she uses it daily\u2014it's been a home run. The insulation is stellar: even with ice inside, her drinks stay cold for hours, just as advertised. The handle and FlowState lid make sipping convenient and spill-resistant. It fits in the cup holder of the car, and bonus: it was on sale when I bought it, so we got great value for how often it's used. 
Highly recommend for anyone looking for a dependable, stylish, and practical tumbler.", - "author_id": "AEDEEDATISPHBA52BATK5VGRIXNQ", - "author_link": "https://www.amazon.com/gp/profile/amzn1.account.AEDEEDATISPHBA52BATK5VGRIXNQ/ref=cm_cr_dp_d_gw_tr?ie=UTF8", - "badge": "Verified Purchase", - "brand": "Stanley 1913", - "review_posted_date": "August 27, 2025", - "review_country": "United States", - "helpful_count": 0, - "is_amazon_vine": false, - "is_verified": true, - "variant_asin": null, - "variant_name": null, - "videos": null, - "categories": [ - "Home & Kitchen", - "Kitchen & Dining", - "Storage & Organization", - "Thermoses", - "Insulated Beverage Containers", - "Tumblers" - ], - "department": "Home & Kitchen", - "timestamp": "2025-11-20T17:03:56.058Z", - "input": { - "url": "https://www.amazon.com/dp/B0CRMZHDG8" - } - } -] \ No newline at end of file diff --git a/tests/samples/chatgpt/prompt.json b/tests/samples/chatgpt/prompt.json deleted file mode 100644 index 9cb4fa3..0000000 --- a/tests/samples/chatgpt/prompt.json +++ /dev/null @@ -1,35 +0,0 @@ -[ - { - "url": "https://chatgpt.com/?model=gpt-4&q=Explain%20Python%20in%20one%20sentence", - "prompt": "Explain Python in one sentence", - "answer_html": "\n\n\n\nChatGPT\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n
\n
\n
\n
\n
\n
\n\n\n\n\n\n\n\n
\n
\n
\n\n
\n
\n
\n
\n\n\n
\n
\n
\n
\n
\n
\n
\n
\n
\n\n
\n\n\n\n\n\n\n
\n
\n\n
\n
\n
\n
\n
\n
\n
\n
\n\n\n
\n\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
You said:
\n
\n
\n
\n
\n
\n
\n
Explain Python in one sentence
\n
\n
\n
\n
\n
\n
\n\n
\n
\n
\n
\n
\n
\n
ChatGPT said:
\n
\n
\n
\n
\n
\n
\n

Python is a high-level, easy-to-read programming language that lets you write powerful software quickly with clear, expressive code.

\n
\n
\n
\n
\n
\n
\n\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n\n
\n
\n
\n\n
\n
\n
\n
\n
\n\n
\n
\n
\n
\n
\n\n\n
\n

\n
\n

\n
\n
\n
\n
\n
\n
\n
\n\n
\n\n
\n
\n
\n
\n
\n
\n\n
\n\n
\n
\n
\n
\n
\n\n
\n
\n
\n
\n
\n
\n
\n\n\n\n
\n
\n
\n
\n
\n
\n
\n
\n\n\n
\n
\n
\n
\n
ChatGPT can make mistakes. Check important info.
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", - "answer_text": "Python is a high-level, easy-to-read programming language that lets you write powerful software quickly with clear, expressive code.", - "links_attached": null, - "citations": null, - "recommendations": [], - "country": "US", - "is_map": false, - "references": [], - "shopping": [], - "shopping_visible": false, - "index": null, - "answer_text_markdown": "Python is a high-level, easy-to-read programming language that lets you write powerful software quickly with clear, expressive code.", - "web_search_triggered": false, - "additional_prompt": null, - "additional_answer_text": null, - "map": null, - "search_sources": [], - "response_raw": "[{\"p\":\"\",\"o\":\"add\",\"v\":{\"message\":{\"id\":\"743621fa-eb8d-4ab8-9425-d06bb28abbb6\",\"author\":{\"role\":\"system\",\"name\":null,\"metadata\":{}},\"create_time\":null,\"update_time\":null,\"content\":{\"content_type\":\"text\",\"parts\":[\"\"]},\"status\":\"finished_successfully\",\"end_turn\":true,\"weight\":0,\"metadata\":{\"is_visually_hidden_from_conversation\":true,\"model_switcher_deny\":[]},\"recipient\":\"all\",\"channel\":null},\"conversation_id\":\"691f4974-ece8-8330-8906-75b31eccd63a\",\"error\":null},\"c\":0},{\"v\":{\"message\":{\"id\":\"c7607fa3-c93f-4449-863b-a5592d98e2c3\",\"author\":{\"role\":\"user\",\"name\":null,\"metadata\":{}},\"create_time\":1763658097.325,\"update_time\":null,\"content\":{\"content_type\":\"text\",\"parts\":[\"Explain Python in one 
sentence\"]},\"status\":\"finished_successfully\",\"end_turn\":null,\"weight\":1,\"metadata\":{\"system_hints\":[],\"request_id\":\"ac58d482-927b-4d2b-ae25-d3f6973af414\",\"message_source\":\"instant-query\",\"turn_exchange_id\":\"0bd313d0-3fb0-4b8e-b029-734a99b893ce\",\"timestamp_\":\"absolute\",\"model_switcher_deny\":[]},\"recipient\":\"all\",\"channel\":null},\"conversation_id\":\"691f4974-ece8-8330-8906-75b31eccd63a\",\"error\":null},\"c\":1},{\"type\":\"input_message\",\"input_message\":{\"id\":\"c7607fa3-c93f-4449-863b-a5592d98e2c3\",\"author\":{\"role\":\"user\",\"name\":null,\"metadata\":{}},\"create_time\":1763658097.325,\"update_time\":null,\"content\":{\"content_type\":\"text\",\"parts\":[\"Explain Python in one sentence\"]},\"status\":\"finished_successfully\",\"end_turn\":null,\"weight\":1,\"metadata\":{\"system_hints\":[],\"request_id\":\"ac58d482-927b-4d2b-ae25-d3f6973af414\",\"message_source\":\"instant-query\",\"turn_exchange_id\":\"0bd313d0-3fb0-4b8e-b029-734a99b893ce\",\"useragent\":{\"client_type\":\"web\",\"is_mobile\":false,\"is_mobile_app\":false,\"is_desktop_app\":false,\"is_native_app\":false,\"is_native_app_apple\":false,\"is_mobile_app_ios\":false,\"is_desktop_app_macos\":false,\"is_aura_app_macos\":false,\"is_aura_web\":false,\"is_sora_ios\":false,\"is_agora_ios\":false,\"is_agora_android\":false,\"is_desktop_app_windows\":false,\"is_electron_app\":false,\"is_mobile_app_android\":false,\"is_mobile_web\":false,\"is_mobile_web_ios\":false,\"is_mobile_web_android\":false,\"is_ios\":false,\"is_android\":false,\"is_chatgpt_client\":false,\"is_sora_client\":false,\"is_agora_client\":false,\"is_browserbased_app\":true,\"is_chatgpt_api\":false,\"is_slack\":false,\"is_chatkit_web\":false,\"is_chatkit_synthetic\":false,\"is_kakao_talk\":false,\"app_version\":null,\"build_number\":null,\"user_agent\":\"mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/142.0.0.0 
safari/537.36\",\"app_environment\":null,\"os_version\":null,\"device_model\":null,\"user_client_type\":\"desktop_web\"},\"timestamp_\":\"absolute\",\"paragen_stream_type\":\"default\",\"parent_id\":\"743621fa-eb8d-4ab8-9425-d06bb28abbb6\"},\"recipient\":\"all\",\"channel\":null},\"conversation_id\":\"691f4974-ece8-8330-8906-75b31eccd63a\"},{\"v\":{\"message\":{\"id\":\"c288c4ba-2d36-4349-b9bf-f3b57337e2db\",\"author\":{\"role\":\"assistant\",\"name\":null,\"metadata\":{}},\"create_time\":1763658101.956405,\"update_time\":1763658102.069259,\"content\":{\"content_type\":\"text\",\"parts\":[\"\"]},\"status\":\"in_progress\",\"end_turn\":null,\"weight\":1,\"metadata\":{\"citations\":[],\"content_references\":[],\"request_id\":\"ac58d482-927b-4d2b-ae25-d3f6973af414\",\"message_type\":\"next\",\"model_slug\":\"gpt-5-1\",\"default_model_slug\":\"auto\",\"parent_id\":\"c7607fa3-c93f-4449-863b-a5592d98e2c3\",\"turn_exchange_id\":\"0bd313d0-3fb0-4b8e-b029-734a99b893ce\",\"timestamp_\":\"absolute\",\"model_switcher_deny\":[]},\"recipient\":\"all\",\"channel\":\"final\"},\"conversation_id\":\"691f4974-ece8-8330-8906-75b31eccd63a\",\"error\":null},\"c\":2},{\"type\":\"server_ste_metadata\",\"metadata\":{\"conduit_prewarmed\":false,\"fast_convo\":true,\"warmup_state\":\"cold\",\"is_first_turn\":true,\"model_slug\":\"gpt-5-1\",\"did_auto_switch_to_reasoning\":false,\"auto_switcher_race_winner\":\"autoswitcher\",\"is_autoswitcher_enabled\":true,\"is_search\":null,\"did_prompt_contain_image\":false,\"message_id\":\"c288c4ba-2d36-4349-b9bf-f3b57337e2db\",\"request_id\":\"ac58d482-927b-4d2b-ae25-d3f6973af414\"},\"conversation_id\":\"691f4974-ece8-8330-8906-75b31eccd63a\"},{\"type\":\"message_marker\",\"conversation_id\":\"691f4974-ece8-8330-8906-75b31eccd63a\",\"message_id\":\"c288c4ba-2d36-4349-b9bf-f3b57337e2db\",\"marker\":\"user_visible_token\",\"event\":\"first\"},{\"o\":\"patch\",\"v\":[{\"p\":\"/message/create_time\",\"o\":\"replace\",\"v\":1763658102.077773},{\"p\":\"/message
/update_time\",\"o\":\"replace\",\"v\":1763658102.105858},{\"p\":\"/message/content/parts/0\",\"o\":\"append\",\"v\":\"Python is\"}]},{\"v\":[{\"p\":\"/message/create_time\",\"o\":\"replace\",\"v\":1763658102.134441},{\"p\":\"/message/update_time\",\"o\":\"replace\",\"v\":1763658102.195202},{\"p\":\"/message/content/parts/0\",\"o\":\"append\",\"v\":\" a high-\"}]},{\"v\":[{\"p\":\"/message/create_time\",\"o\":\"replace\",\"v\":1763658102.267426},{\"p\":\"/message/update_time\",\"o\":\"replace\",\"v\":1763658102.295866},{\"p\":\"/message/content/parts/0\",\"o\":\"append\",\"v\":\"level, easy\"}]},{\"v\":[{\"p\":\"/message/create_time\",\"o\":\"replace\",\"v\":1763658102.445245},{\"p\":\"/message/update_time\",\"o\":\"replace\",\"v\":1763658102.513697},{\"p\":\"/message/content/parts/0\",\"o\":\"append\",\"v\":\"-to-read programming language\"}]},{\"v\":[{\"p\":\"/message/create_time\",\"o\":\"replace\",\"v\":1763658102.699152},{\"p\":\"/message/update_time\",\"o\":\"replace\",\"v\":1763658102.72003},{\"p\":\"/message/content/parts/0\",\"o\":\"append\",\"v\":\" that lets you write powerful software quickly\"}]},{\"v\":[{\"p\":\"/message/create_time\",\"o\":\"replace\",\"v\":1763658101.956405},{\"p\":\"/message/update_time\",\"o\":\"replace\",\"v\":1763658102.852512},{\"p\":\"/message/content/parts/0\",\"o\":\"append\",\"v\":\" with clear, expressive 
code.\"},{\"p\":\"/message/status\",\"o\":\"replace\",\"v\":\"finished_successfully\"},{\"p\":\"/message/end_turn\",\"o\":\"replace\",\"v\":true},{\"p\":\"/message/metadata\",\"o\":\"append\",\"v\":{\"is_complete\":true,\"finish_details\":{\"type\":\"stop\",\"stop_tokens\":[200002]},\"sonic_classification_result\":{\"latency_ms\":19.449779065325856,\"simple_search_prob\":0.1281321013316676,\"complex_search_prob\":0.00004177866803718204,\"no_search_prob\":0.8718261200002951,\"search_complexity_decision\":\"no_search\",\"search_decision\":false,\"simple_search_threshold\":0,\"complex_search_threshold\":0.4,\"no_search_threshold\":0.12,\"threshold_order\":[\"no_search\",\"complex\",\"simple\"],\"classifier_config_name\":\"sonic_classifier_3cls_ev3\",\"classifier_config\":{\"model_name\":\"snc-pg-sw-3cls-ev3\",\"renderer_name\":\"harmony_v4.0.15_16k_orion_text_only_no_asr_2k_action\",\"force_disabled_rate\":0,\"force_enabled_rate\":0,\"num_messages\":20,\"only_user_messages\":false,\"remove_memory\":true,\"support_mm\":true,\"n_ctx\":2048,\"max_action_length\":4,\"dynamic_set_max_message_size\":false,\"max_message_tokens\":2000,\"append_base_config\":false,\"no_search_token\":\"1\",\"simple_search_token\":\"7\",\"complex_search_token\":\"5\",\"simple_search_threshold\":0,\"complex_search_threshold\":0.4,\"no_search_threshold\":0.12,\"prefetch_threshold\":null,\"force_search_first_turn_threshold\":0.00001,\"threshold_order\":[\"no_search\",\"complex\",\"simple\"],\"passthrough_tool_calls\":null,\"timeout\":1},\"decision_source\":\"classifier\",\"passthrough_tool_names\":[]}}}]},{\"type\":\"message_stream_complete\",\"conversation_id\":\"691f4974-ece8-8330-8906-75b31eccd63a\"},{\"type\":\"conversation_detail_metadata\",\"banner_info\":null,\"blocked_features\":[],\"model_limits\":[],\"limits_progress\":[{\"feature_name\":\"file_upload\",\"remaining\":3,\"reset_after\":\"2025-11-21T17:01:43.229556+00:00\"}],\"default_model_slug\":\"auto\",\"conversation_id\":\"691f4974-ece
8-8330-8906-75b31eccd63a\"}]", - "answer_section_html": "
\n
\n
\n
\n
\n
\n

Python is a high-level, easy-to-read programming language that lets you write powerful software quickly with clear, expressive code.

\n
\n
\n
\n
\n
\n
\n\n
\n
\n
\n
", - "model": "gpt-5-1", - "web_search_query": null, - "timestamp": "2025-11-20T17:01:52.049Z", - "input": { - "url": "https://chatgpt.com/", - "prompt": "Explain Python in one sentence", - "country": "US", - "web_search": false, - "additional_prompt": "" - } - } -] \ No newline at end of file diff --git a/tests/samples/facebook/posts.json b/tests/samples/facebook/posts.json deleted file mode 100644 index 7a6609d..0000000 --- a/tests/samples/facebook/posts.json +++ /dev/null @@ -1,537 +0,0 @@ -[ - { - "url": "https://www.facebook.com/reel/1178168373700071/", - "post_id": "1346166837555333", - "user_url": "https://www.facebook.com/facebook", - "user_username_raw": "Facebook", - "content": "While in Nashville for the #FacebookRoadTrip, we caught up with singer-songwriter Kane Brown on everything from golfing in Scotland to reminiscing about his very first tour. Share your own memories from the road to Kane\u2019s Fan Challenge on Facebook using #RoadTripMemoriesChallenge \ud83e\udd20", - "date_posted": "2025-11-19T20:40:47.000Z", - "hashtags": [ - "facebookroadtrip" - ], - "num_comments": 2093, - "num_shares": 157, - "num_likes_type": { - "type": "Like", - "num": 6356 - }, - "page_name": "Facebook", - "profile_id": "100064860875397", - "page_intro": "Page \u00b7 Internet company", - "page_category": "Internet company", - "page_logo": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "page_external_website": "fb.me/HowToContactFB", - "page_followers": 155000000, - "page_is_verified": true, - "attachments": [ - { - "id": "1178168373700071", - "type": "Video", - "url": 
"https://scontent.fotp3-3.fna.fbcdn.net/v/t15.5256-10/584976832_871349975319798_4871365287803428825_n.jpg?stp=dst-jpg_p296x100_tt6&_nc_cat=1&ccb=1-7&_nc_sid=d2b52d&_nc_ohc=gjMY8ZReEDoQ7kNvwGa8N-t&_nc_oc=Adl-ppGoZbPqGT487mkOT_ZyctGC7JXlKIS0zlWBTxZngZZPrwUF6rvTHPARo2g1XuY&_nc_zt=23&_nc_ht=scontent.fotp3-3.fna&_nc_gid=-3h60myjfLqRmenlXvzYQg&oh=00_AfixTptTxvM9KohUg9LqBxrnSsWUoThZT5WAvCa9gOylXA&oe=69250419", - "video_length": "60400", - "attachment_url": "https://www.facebook.com/reel/1178168373700071/", - "video_url": "https://video.fotp3-2.fna.fbcdn.net/o1/v/t2/f2/m366/AQMaVXPDlqn-RupvW09GASa3Gn4QKH2Vp_N1bpg0NrK0W5MONdKe4jnNJqLIyU9zoaXhUy7vfnWThFUyzmro_cgEuOYaCpFVcuNiXi_K6_EPnA.mp4?_nc_cat=109&_nc_oc=Adk9XFWEXJB9J4dxN_xZQ6g9L9DT1sDIysvNTKyxpB78y5pWs7wYxpo7-edLigPnfZE&_nc_sid=5e9851&_nc_ht=video.fotp3-2.fna.fbcdn.net&_nc_ohc=YHFzhNGXeSgQ7kNvwERY-wD&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5GQUNFQk9PSy4uQzMuNzIwLmRhc2hfaDI2NC1iYXNpYy1nZW4yXzcyMHAiLCJ4cHZfYXNzZXRfaWQiOjE5MTUyMjkyNDkzOTk5MzYsImFzc2V0X2FnZV9kYXlzIjowLCJ2aV91c2VjYXNlX2lkIjoxMDEyMiwiZHVyYXRpb25fcyI6NjAsInVybGdlbl9zb3VyY2UiOiJ3d3cifQ%3D%3D&ccb=17-1&vs=7200846e54bcdebc&_nc_vs=HBksFQIYRWZiX2VwaGVtZXJhbC9CRjQ3QUExRDk3MUU2MDhBNkJGODY1RUQwQUZCMDA4N19tdF8xX3ZpZGVvX2Rhc2hpbml0Lm1wNBUAAsgBEgAVAhhAZmJfcGVybWFuZW50LzA2NEUzQjMwRDVGNDNDOUVFNzI4OENFN0ZFODc0Q0FFX2F1ZGlvX2Rhc2hpbml0Lm1wNBUCAsgBEgAoABgAGwKIB3VzZV9vaWwBMRJwcm9ncmVzc2l2ZV9yZWNpcGUBMRUAACaAspfxgfnmBhUCKAJDMywXQE4zMzMzMzMYGWRhc2hfaDI2NC1iYXNpYy1nZW4yXzcyMHARAHUCZZSeAQA&_nc_gid=-3h60myjfLqRmenlXvzYQg&_nc_zt=28&oh=00_AfimffoprOlAs92pqZdC2KPErVR0HJTFRLSaUoxCdzEL6g&oe=69251CA3&bitrate=1997814&tag=dash_h264-basic-gen2_720p" - } - ], - "post_external_image": null, - "page_url": "https://www.facebook.com/facebook", - "header_image": 
"https://scontent.fotp3-4.fna.fbcdn.net/v/t39.30808-6/513094825_10164819146606729_8444440187994304660_n.jpg?stp=dst-jpg_s960x960_tt6&_nc_cat=110&ccb=1-7&_nc_sid=cc71e4&_nc_ohc=VsiHP2aGf3MQ7kNvwFTV3XC&_nc_oc=AdkylD-RY8FvW2JntucYN4H7R89r36f2Bd_ogoTze8GT_dAnJbCu-RKxVkl6QfZsw9I&_nc_zt=23&_nc_ht=scontent.fotp3-4.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_Afg6j5-JjOxy77BdW2zEv1Zqhw6_y8xb4Z0ee6b8zX22fA&oe=6925133D", - "avatar_image_url": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "profile_handle": "facebook", - "is_sponsored": false, - "shortcode": "1346166837555333", - "video_view_count": 48896, - "likes": 8018, - "post_type": "Reel", - "following": null, - "link_description_text": null, - "count_reactions_type": [ - { - "type": "Like", - "reaction_count": 6356 - }, - { - "type": "Love", - "reaction_count": 1354 - }, - { - "type": "Care", - "reaction_count": 246 - }, - { - "type": "Wow", - "reaction_count": 46 - }, - { - "type": "Haha", - "reaction_count": 8 - }, - { - "type": "Sad", - "reaction_count": 5 - }, - { - "type": "Angry", - "reaction_count": 3 - } - ], - "is_page": true, - "page_phone": null, - "page_email": null, - "page_creation_time": "2007-11-07T00:00:00.000Z", - "page_reviews_score": null, - "page_reviewers_amount": null, - "page_price_range": null, - "about": [ - { - "type": "INFLUENCER CATEGORY", - "value": "Page \u00b7 Internet company", - "link": null - }, - { - "type": "WEBSITE", - "value": "fb.me/HowToContactFB", - "link": "https://fb.me/HowToContactFB" - } - ], - "active_ads_urls": [], - "delegate_page_id": "20531316728", - "privacy_and_legal_info": null, - "timestamp": 
"2025-11-20T16:55:55.934Z", - "input": { - "url": "https://www.facebook.com/facebook", - "num_of_posts": 5, - "start_date": "", - "end_date": "" - } - }, - { - "url": "https://www.facebook.com/facebook/posts/pfbid02o9kd9bePA6C6EdPHyPEUsKGDeM9QmJ4EPY7BdZnUzJKe9EHDZkkf3AtCNd3ZxeU4l", - "post_id": "1346025967569420", - "user_url": "https://www.facebook.com/facebook", - "user_username_raw": "Facebook", - "content": "Hey, Music City! We\u2019re headed to seven US cities on the #FacebookRoadTrip to bring the Facebook vibes to all our friends IRL. Check out all the fun we had in Nashville and be sure to join us at our *last* stop on the tour in New York City next month!", - "date_posted": "2025-11-19T16:59:27.000Z", - "hashtags": [ - "facebookroadtrip" - ], - "num_comments": 8757, - "num_shares": 573, - "num_likes_type": { - "type": "Like", - "num": 23285 - }, - "page_name": "Facebook", - "profile_id": "100064860875397", - "page_intro": "Page \u00b7 Internet company", - "page_category": "Internet company", - "page_logo": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "page_external_website": "fb.me/HowToContactFB", - "page_followers": 155000000, - "page_is_verified": true, - "attachments": [ - { - "id": "1346022090903141", - "type": "Photo", - "url": 
"https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-6/585669351_1346026050902745_7640051638980346272_n.jpg?_nc_cat=1&ccb=1-7&_nc_sid=f727a1&_nc_ohc=GpHH57YINDsQ7kNvwHYa6Am&_nc_oc=AdkTbnmoGEgm3PNARgBirW9QhrL-v4SxrJVRTM-zv5exYSemUW6CN_UpLonpZfll_iI&_nc_zt=23&_nc_ht=scontent.fotp3-3.fna&_nc_gid=_MKGgoF8MSZeS1IDEI0gtw&oh=00_AfipsOuKQ3ZfBQCUaffMcQI89jYZXEgek3QtAdCuOrhXkw&oe=69252EA5", - "attachment_url": "https://www.facebook.com/photo.php?fbid=1346022090903141&set=a.1272781121560572&type=3", - "video_url": null - }, - { - "id": "1346022140903136", - "type": "Photo", - "url": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-6/584614169_1346026080902742_8497372545534067199_n.jpg?_nc_cat=1&ccb=1-7&_nc_sid=f727a1&_nc_ohc=rx_aej1IiugQ7kNvwFjW98p&_nc_oc=AdkxB7s6iOJSXyeOjmnGy9y_RSex-qScBAsxd7jQ-zY2Lb6vbMB4RmdOxNv2VK5RGKs&_nc_zt=23&_nc_ht=scontent.fotp3-3.fna&_nc_gid=_MKGgoF8MSZeS1IDEI0gtw&oh=00_AfhctYoo9bNwtA_6Xq7at8Z0K9Lk1EzuycXCOdvGsmubPw&oe=69250B3B", - "attachment_url": "https://www.facebook.com/photo.php?fbid=1346022140903136&set=a.1272781121560572&type=3", - "video_url": null - }, - { - "id": "1346022154236468", - "type": "Photo", - "url": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-6/585343367_1346026084236075_767938696844464465_n.jpg?_nc_cat=1&ccb=1-7&_nc_sid=f727a1&_nc_ohc=gcgvljl_EHEQ7kNvwH6jsjF&_nc_oc=AdkDcHIJaoW90iO8TdvguiMjpIjgyChIj8ykD4evRmWpU0X9QOoa11sg6cfSPkk2VUs&_nc_zt=23&_nc_ht=scontent.fotp3-3.fna&_nc_gid=_MKGgoF8MSZeS1IDEI0gtw&oh=00_AfhT6Fbu0LyofJXR2ZhNX4mAOwkN_2LPJda4Oy5mAK46zw&oe=69251D3C", - "attachment_url": "https://www.facebook.com/photo.php?fbid=1346022154236468&set=a.1272781121560572&type=3", - "video_url": null - }, - { - "id": "1346022194236464", - "type": "Photo", - "url": 
"https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-6/584731919_1346026097569407_5936004192315395883_n.jpg?_nc_cat=1&ccb=1-7&_nc_sid=f727a1&_nc_ohc=HArPchLOCFIQ7kNvwEqOT25&_nc_oc=AdlpNnL2wTM4iuXkPlFZRFCKoPjJPtJ5rJIOBNCNQjshM-QRRfisFeJgWEThuHDil14&_nc_zt=23&_nc_ht=scontent.fotp3-3.fna&_nc_gid=_MKGgoF8MSZeS1IDEI0gtw&oh=00_AfhwMz0UnQCkilZ4FKtjnXwj-UhdLQPfiLM99t_rwx4kug&oe=69250D3D", - "attachment_url": "https://www.facebook.com/photo.php?fbid=1346022194236464&set=a.1272781121560572&type=3", - "video_url": null - }, - { - "id": "1346022104236473", - "type": "Photo", - "url": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-6/587247300_1346026057569411_1976402081820657581_n.jpg?_nc_cat=1&ccb=1-7&_nc_sid=f727a1&_nc_ohc=P9XJp6BUEFAQ7kNvwFxdU8-&_nc_oc=AdnAAmB317anVCSGf6SwjCWxoV3AYXf5GE2jauJbNUOMNMnZPYZX8EmBsO-qJcc9CtM&_nc_zt=23&_nc_ht=scontent.fotp3-3.fna&_nc_gid=_MKGgoF8MSZeS1IDEI0gtw&oh=00_AfhYSPTWNC6xMQcTtQl8_YAnlOQHIx8sK-yTWjL0cL3uRQ&oe=692526FC", - "attachment_url": "https://www.facebook.com/photo.php?fbid=1346022104236473&set=a.1272781121560572&type=3", - "video_url": null - } - ], - "post_external_image": null, - "page_url": "https://www.facebook.com/facebook", - "header_image": "https://scontent.fotp3-4.fna.fbcdn.net/v/t39.30808-6/513094825_10164819146606729_8444440187994304660_n.jpg?stp=dst-jpg_s960x960_tt6&_nc_cat=110&ccb=1-7&_nc_sid=cc71e4&_nc_ohc=VsiHP2aGf3MQ7kNvwFTV3XC&_nc_oc=AdkylD-RY8FvW2JntucYN4H7R89r36f2Bd_ogoTze8GT_dAnJbCu-RKxVkl6QfZsw9I&_nc_zt=23&_nc_ht=scontent.fotp3-4.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_Afg6j5-JjOxy77BdW2zEv1Zqhw6_y8xb4Z0ee6b8zX22fA&oe=6925133D", - "avatar_image_url": 
"https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "profile_handle": "facebook", - "is_sponsored": false, - "shortcode": "1346025967569420", - "likes": 30321, - "post_image": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-6/585669351_1346026050902745_7640051638980346272_n.jpg?_nc_cat=1&ccb=1-7&_nc_sid=f727a1&_nc_ohc=GpHH57YINDsQ7kNvwHYa6Am&_nc_oc=AdkTbnmoGEgm3PNARgBirW9QhrL-v4SxrJVRTM-zv5exYSemUW6CN_UpLonpZfll_iI&_nc_zt=23&_nc_ht=scontent.fotp3-3.fna&_nc_gid=_MKGgoF8MSZeS1IDEI0gtw&oh=00_AfipsOuKQ3ZfBQCUaffMcQI89jYZXEgek3QtAdCuOrhXkw&oe=69252EA5", - "post_type": "Post", - "following": null, - "link_description_text": null, - "count_reactions_type": [ - { - "type": "Like", - "reaction_count": 23285 - }, - { - "type": "Love", - "reaction_count": 5845 - }, - { - "type": "Care", - "reaction_count": 889 - }, - { - "type": "Wow", - "reaction_count": 229 - }, - { - "type": "Haha", - "reaction_count": 56 - }, - { - "type": "Sad", - "reaction_count": 9 - }, - { - "type": "Angry", - "reaction_count": 8 - } - ], - "is_page": true, - "page_phone": null, - "page_email": null, - "page_creation_time": "2007-11-07T00:00:00.000Z", - "page_reviews_score": null, - "page_reviewers_amount": null, - "page_price_range": null, - "about": [ - { - "type": "INFLUENCER CATEGORY", - "value": "Page \u00b7 Internet company", - "link": null - }, - { - "type": "WEBSITE", - "value": "fb.me/HowToContactFB", - "link": "https://fb.me/HowToContactFB" - } - ], - "active_ads_urls": [], - "delegate_page_id": "20531316728", - "privacy_and_legal_info": null, - "timestamp": "2025-11-20T16:55:55.934Z", - "input": { - "url": 
"https://www.facebook.com/facebook", - "num_of_posts": 5, - "start_date": "", - "end_date": "" - } - }, - { - "url": "https://www.facebook.com/facebook/posts/pfbid02nHWsd8pxGMmvvEEEyv2JKMCKK9g74F35PceVr7onVQq7dDx9PddoRLw6GndboRCLl", - "post_id": "1345095954329088", - "user_url": "https://www.facebook.com/facebook", - "user_username_raw": "Facebook", - "content": "Put a finger down if you\u2019re currently spiraling after liking your crush\u2019s story\u2026", - "date_posted": "2025-11-18T17:00:00.000Z", - "num_comments": 5303, - "num_shares": 392, - "num_likes_type": { - "type": "Like", - "num": 18443 - }, - "page_name": "Facebook", - "profile_id": "100064860875397", - "page_intro": "Page \u00b7 Internet company", - "page_category": "Internet company", - "page_logo": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "page_external_website": "fb.me/HowToContactFB", - "page_followers": 155000000, - "page_is_verified": true, - "post_external_image": null, - "page_url": "https://www.facebook.com/facebook", - "header_image": "https://scontent.fotp3-4.fna.fbcdn.net/v/t39.30808-6/513094825_10164819146606729_8444440187994304660_n.jpg?stp=dst-jpg_s960x960_tt6&_nc_cat=110&ccb=1-7&_nc_sid=cc71e4&_nc_ohc=VsiHP2aGf3MQ7kNvwFTV3XC&_nc_oc=AdkylD-RY8FvW2JntucYN4H7R89r36f2Bd_ogoTze8GT_dAnJbCu-RKxVkl6QfZsw9I&_nc_zt=23&_nc_ht=scontent.fotp3-4.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_Afg6j5-JjOxy77BdW2zEv1Zqhw6_y8xb4Z0ee6b8zX22fA&oe=6925133D", - "avatar_image_url": 
"https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "profile_handle": "facebook", - "is_sponsored": false, - "shortcode": "1345095954329088", - "likes": 23613, - "post_type": "Post", - "following": null, - "link_description_text": null, - "count_reactions_type": [ - { - "type": "Like", - "reaction_count": 18443 - }, - { - "type": "Love", - "reaction_count": 3678 - }, - { - "type": "Haha", - "reaction_count": 802 - }, - { - "type": "Care", - "reaction_count": 550 - }, - { - "type": "Wow", - "reaction_count": 85 - }, - { - "type": "Sad", - "reaction_count": 30 - }, - { - "type": "Angry", - "reaction_count": 25 - } - ], - "is_page": true, - "page_phone": null, - "page_email": null, - "page_creation_time": "2007-11-07T00:00:00.000Z", - "page_reviews_score": null, - "page_reviewers_amount": null, - "page_price_range": null, - "about": [ - { - "type": "INFLUENCER CATEGORY", - "value": "Page \u00b7 Internet company", - "link": null - }, - { - "type": "WEBSITE", - "value": "fb.me/HowToContactFB", - "link": "https://fb.me/HowToContactFB" - } - ], - "active_ads_urls": [], - "delegate_page_id": "20531316728", - "privacy_and_legal_info": null, - "timestamp": "2025-11-20T16:55:55.934Z", - "input": { - "url": "https://www.facebook.com/facebook", - "num_of_posts": 5, - "start_date": "", - "end_date": "" - } - }, - { - "url": "https://www.facebook.com/reel/1381683193563154/", - "post_id": "1344308637741153", - "user_url": "https://www.facebook.com/facebook", - "user_username_raw": "Facebook", - "content": "This reel is your urgent reminder that soup szn has arrived \ud83e\udd24\n\nVideo by Essen Paradies", - "date_posted": 
"2025-11-17T21:59:55.000Z", - "num_comments": 3091, - "num_shares": 2368, - "num_likes_type": { - "type": "Like", - "num": 18297 - }, - "page_name": "Facebook", - "profile_id": "100064860875397", - "page_intro": "Page \u00b7 Internet company", - "page_category": "Internet company", - "page_logo": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "page_external_website": "fb.me/HowToContactFB", - "page_followers": 155000000, - "page_is_verified": true, - "attachments": [ - { - "id": "1381683193563154", - "type": "Video", - "url": "https://scontent.fotp3-4.fna.fbcdn.net/v/t15.5256-10/583966455_33094466116867804_7048232568839350902_n.jpg?stp=dst-jpg_p296x100_tt6&_nc_cat=108&ccb=1-7&_nc_sid=d2b52d&_nc_ohc=lPrKUi3BRIwQ7kNvwGJ3H9R&_nc_oc=AdkQQNfEqT-WjYi-Y2_88OyKeSJLKLB0KgoAq5zfwF592KRG6Vwnbj8xjbp-HylnXcM&_nc_zt=23&_nc_ht=scontent.fotp3-4.fna&_nc_gid=_MKGgoF8MSZeS1IDEI0gtw&oh=00_AfjPtucBTof3JQOP7l8yI9ej1mrEuhUFs-85HoE1mPillw&oe=69251884", - "video_length": "24700", - "attachment_url": "https://www.facebook.com/reel/1381683193563154/", - "video_url": 
"https://video.fotp3-2.fna.fbcdn.net/o1/v/t2/f2/m366/AQOW0kYCUDer2UeIrz3h4fMr4dfT80_dIwF6WxM6Cru0cYzWYP13O4FE8-0kh3UBV0Iq1X6mGfUxYhADV8hFnKrv-5v5zoF7BhmmyA4tnnsyoA.mp4?_nc_cat=105&_nc_oc=AdnvPq5uGqIhohUE2ZR4lUyI6-amonnjO3IBNPthwJpOqiUMszG9WktmU3LKElFqONc&_nc_sid=5e9851&_nc_ht=video.fotp3-2.fna.fbcdn.net&_nc_ohc=E3a9CqQXQhMQ7kNvwFgFqCF&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5GQUNFQk9PSy4uQzMuNzIwLmRhc2hfaDI2NC1iYXNpYy1nZW4yXzcyMHAiLCJ4cHZfYXNzZXRfaWQiOjgyMDg1MjYxNzIyMDMwMiwiYXNzZXRfYWdlX2RheXMiOjMsInZpX3VzZWNhc2VfaWQiOjEwMTIyLCJkdXJhdGlvbl9zIjoyNCwidXJsZ2VuX3NvdXJjZSI6Ind3dyJ9&ccb=17-1&vs=81c995631c3d8b89&_nc_vs=HBksFQIYRWZiX2VwaGVtZXJhbC9EQjRCQjIyMUYwRkQzODg1NTA1MzFEMDUyQ0IzNTZBQl9tdF8xX3ZpZGVvX2Rhc2hpbml0Lm1wNBUAAsgBEgAVAhhAZmJfcGVybWFuZW50L0Y3NDM1NkExQTYzMUJBMzFCMUE3QTY5QzlFRUIyMjlDX2F1ZGlvX2Rhc2hpbml0Lm1wNBUCAsgBEgAoABgAGwKIB3VzZV9vaWwBMRJwcm9ncmVzc2l2ZV9yZWNpcGUBMRUAACacw8zK9KP1AhUCKAJDMywXQDizMzMzMzMYGWRhc2hfaDI2NC1iYXNpYy1nZW4yXzcyMHARAHUCZZSeAQA&_nc_gid=_MKGgoF8MSZeS1IDEI0gtw&_nc_zt=28&oh=00_AfjXRi9v9QrT_Cjm3Cg1-gOcU5fPalkt147GYfZyvoS_rQ&oe=6925116B&bitrate=2751266&tag=dash_h264-basic-gen2_720p" - } - ], - "post_external_image": null, - "page_url": "https://www.facebook.com/facebook", - "header_image": "https://scontent.fotp3-4.fna.fbcdn.net/v/t39.30808-6/513094825_10164819146606729_8444440187994304660_n.jpg?stp=dst-jpg_s960x960_tt6&_nc_cat=110&ccb=1-7&_nc_sid=cc71e4&_nc_ohc=VsiHP2aGf3MQ7kNvwFTV3XC&_nc_oc=AdkylD-RY8FvW2JntucYN4H7R89r36f2Bd_ogoTze8GT_dAnJbCu-RKxVkl6QfZsw9I&_nc_zt=23&_nc_ht=scontent.fotp3-4.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_Afg6j5-JjOxy77BdW2zEv1Zqhw6_y8xb4Z0ee6b8zX22fA&oe=6925133D", - "avatar_image_url": 
"https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "profile_handle": "facebook", - "is_sponsored": false, - "shortcode": "1344308637741153", - "video_view_count": 1348545, - "likes": 22573, - "post_type": "Reel", - "following": null, - "link_description_text": null, - "count_reactions_type": [ - { - "type": "Like", - "reaction_count": 18297 - }, - { - "type": "Love", - "reaction_count": 3571 - }, - { - "type": "Wow", - "reaction_count": 360 - }, - { - "type": "Care", - "reaction_count": 289 - }, - { - "type": "Haha", - "reaction_count": 34 - }, - { - "type": "Sad", - "reaction_count": 12 - }, - { - "type": "Angry", - "reaction_count": 10 - } - ], - "is_page": true, - "page_phone": null, - "page_email": null, - "page_creation_time": "2007-11-07T00:00:00.000Z", - "page_reviews_score": null, - "page_reviewers_amount": null, - "page_price_range": null, - "about": [ - { - "type": "INFLUENCER CATEGORY", - "value": "Page \u00b7 Internet company", - "link": null - }, - { - "type": "WEBSITE", - "value": "fb.me/HowToContactFB", - "link": "https://fb.me/HowToContactFB" - } - ], - "active_ads_urls": [], - "delegate_page_id": "20531316728", - "privacy_and_legal_info": null, - "timestamp": "2025-11-20T16:55:55.934Z", - "input": { - "url": "https://www.facebook.com/facebook", - "num_of_posts": 5, - "start_date": "", - "end_date": "" - } - }, - { - "url": "https://www.facebook.com/facebook/posts/pfbid0cjvy6GcddaRhymuiwpnXDdvaVyRy7ZzTT5N8zvKJXEGvvTb3bFmKne6H6J8aVYvol", - "post_id": "1344226454416038", - "user_url": "https://www.facebook.com/facebook", - "user_username_raw": "Facebook", - "content": "\u2018Tis the season to ask 
Meta AI for yummy baking recipes\n\nMade with Meta AI", - "date_posted": "2025-11-17T19:59:56.000Z", - "num_comments": 3456, - "num_shares": 372, - "num_likes_type": { - "type": "Like", - "num": 9601 - }, - "page_name": "Facebook", - "profile_id": "100064860875397", - "page_intro": "Page \u00b7 Internet company", - "page_category": "Internet company", - "page_logo": "https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "page_external_website": "fb.me/HowToContactFB", - "page_followers": 155000000, - "page_is_verified": true, - "attachments": [ - { - "id": "1344102401095110", - "type": "Photo", - "url": "https://scontent.fotp3-4.fna.fbcdn.net/v/t39.30808-6/583535523_1344102404428443_764020420504838959_n.jpg?stp=dst-jpg_p526x296_tt6&_nc_cat=110&ccb=1-7&_nc_sid=833d8c&_nc_ohc=UwXx_w-yY-4Q7kNvwHI92JR&_nc_oc=AdnIBQD97VrFs4cwsrObV-NB13U0OFu83IukV4n07p9jKd_bGA_GI5OpoufEK8BkeeA&_nc_zt=23&_nc_ht=scontent.fotp3-4.fna&_nc_gid=WHg8XZNKQkBGkIvCnGF3kQ&oh=00_AfgnWX67Rke8m83S2TxFla4c1rJdRxMThFbqBT1O7eGyrg&oe=69250449", - "video_url": null - } - ], - "post_external_image": null, - "page_url": "https://www.facebook.com/facebook", - "header_image": "https://scontent.fotp3-4.fna.fbcdn.net/v/t39.30808-6/513094825_10164819146606729_8444440187994304660_n.jpg?stp=dst-jpg_s960x960_tt6&_nc_cat=110&ccb=1-7&_nc_sid=cc71e4&_nc_ohc=VsiHP2aGf3MQ7kNvwFTV3XC&_nc_oc=AdkylD-RY8FvW2JntucYN4H7R89r36f2Bd_ogoTze8GT_dAnJbCu-RKxVkl6QfZsw9I&_nc_zt=23&_nc_ht=scontent.fotp3-4.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_Afg6j5-JjOxy77BdW2zEv1Zqhw6_y8xb4Z0ee6b8zX22fA&oe=6925133D", - "avatar_image_url": 
"https://scontent.fotp3-3.fna.fbcdn.net/v/t39.30808-1/380700650_10162533193146729_2379134611963304810_n.jpg?stp=dst-jpg_s200x200_tt6&_nc_cat=1&ccb=1-7&_nc_sid=2d3e12&_nc_ohc=oDCg5qbKk18Q7kNvwH1omta&_nc_oc=AdnxTQz33y5kwit1v84JwizErq1XqwuCDxD778aUH-QwCwKFInGJ3h36bU8QdgTFIMQ&_nc_zt=24&_nc_ht=scontent.fotp3-3.fna&_nc_gid=oRJx01wii-4dy45Tgx-ryQ&oh=00_AfhOONC6gHhDr0g7o-ddyqw7at6-bHl2iZhb8UWQH_58pA&oe=69250D8E", - "profile_handle": "facebook", - "is_sponsored": false, - "shortcode": "1344226454416038", - "likes": 12534, - "post_image": "https://scontent.fotp3-4.fna.fbcdn.net/v/t39.30808-6/583535523_1344102404428443_764020420504838959_n.jpg?stp=dst-jpg_p526x296_tt6&_nc_cat=110&ccb=1-7&_nc_sid=833d8c&_nc_ohc=UwXx_w-yY-4Q7kNvwHI92JR&_nc_oc=AdnIBQD97VrFs4cwsrObV-NB13U0OFu83IukV4n07p9jKd_bGA_GI5OpoufEK8BkeeA&_nc_zt=23&_nc_ht=scontent.fotp3-4.fna&_nc_gid=WHg8XZNKQkBGkIvCnGF3kQ&oh=00_AfgnWX67Rke8m83S2TxFla4c1rJdRxMThFbqBT1O7eGyrg&oe=69250449", - "post_type": "Post", - "following": null, - "link_description_text": null, - "count_reactions_type": [ - { - "type": "Like", - "reaction_count": 9601 - }, - { - "type": "Love", - "reaction_count": 2340 - }, - { - "type": "Care", - "reaction_count": 353 - }, - { - "type": "Wow", - "reaction_count": 119 - }, - { - "type": "Haha", - "reaction_count": 92 - }, - { - "type": "Angry", - "reaction_count": 21 - }, - { - "type": "Sad", - "reaction_count": 8 - } - ], - "is_page": true, - "page_phone": null, - "page_email": null, - "page_creation_time": "2007-11-07T00:00:00.000Z", - "page_reviews_score": null, - "page_reviewers_amount": null, - "page_price_range": null, - "about": [ - { - "type": "INFLUENCER CATEGORY", - "value": "Page \u00b7 Internet company", - "link": null - }, - { - "type": "WEBSITE", - "value": "fb.me/HowToContactFB", - "link": "https://fb.me/HowToContactFB" - } - ], - "active_ads_urls": [], - "delegate_page_id": "20531316728", - "privacy_and_legal_info": null, - "timestamp": "2025-11-20T16:55:55.934Z", - "input": { - "url": 
"https://www.facebook.com/facebook", - "num_of_posts": 5, - "start_date": "", - "end_date": "" - } - } -] \ No newline at end of file diff --git a/tests/samples/instagram/profile.json b/tests/samples/instagram/profile.json deleted file mode 100644 index 9653911..0000000 --- a/tests/samples/instagram/profile.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "account": "instagram", - "fbid": "17841400039600391", - "id": "25025320", - "followers": 697291572, - "posts_count": 8241, - "is_business_account": false, - "is_professional_account": true, - "is_verified": true, - "avg_engagement": 0.0017, - "external_url": [ - "http://help.instagram.com/" - ], - "biography": "Discover what's new on Instagram \ud83d\udd0e\u2728", - "following": 286, - "posts": [ - { - "caption": "painting by mouth \ud83d\udc44\u2063\n \u2063\nVideo by @millybampainti \u2063\nMusic by @opheliawilde.music", - "comments": 11454, - "datetime": "2025-11-19T17:17:57.000Z", - "id": "3769442339278306374", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/581734031_18681801997001321_1932070576932116056_n.jpg?stp=dst-jpg_e15_fr_p1080x1080_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=k_PsIcaWzwwQ7kNvwHXX_2n&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfgmWiSPCR5EYn-4wzkrQ2eEBQ2hUmY8diOXiN9Ou_izxQ&oe=692528A9&_nc_sid=8b3546", - "likes": 715407, - "content_type": "Video", - "url": "https://www.instagram.com/p/DRPv9YSADxG", - "video_url": 
"https://scontent-fra3-2.cdninstagram.com/o1/v/t2/f2/m86/AQO-mxfrthrywUTd_aHwYneykT5hR8alV39J6PyTqACz07xSttT0U4IoE1aG1t2hBkcL4MGqeI7jK7_ni3C0K2lxo3aQxC4NUJT_y9U.mp4?_nc_cat=1&_nc_sid=5e9851&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_ohc=uidhRAIfHwYQ7kNvwHvlvT9&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5JTlNUQUdSQU0uQ0xJUFMuQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSIsInhwdl9hc3NldF9pZCI6MTIxNTE0ODgwNzE5MjYxMywiYXNzZXRfYWdlX2RheXMiOjAsInZpX3VzZWNhc2VfaWQiOjEwMDk5LCJkdXJhdGlvbl9zIjoxOSwidXJsZ2VuX3NvdXJjZSI6Ind3dyJ9&ccb=17-1&vs=f5be72bcf5dcb551&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC8yQzQ0QjIzOTkxN0FCNkQ2RDJCQkFGRTNCMDcyNkI5RF92aWRlb19kYXNoaW5pdC5tcDQVAALIARIAFQIYOnBhc3N0aHJvdWdoX2V2ZXJzdG9yZS9HS2RkQVNPM05zclNMUHdDQUJERUdGbnY5d1ZSYnN0VEFRQUYVAgLIARIAKAAYABsCiAd1c2Vfb2lsATEScHJvZ3Jlc3NpdmVfcmVjaXBlATEVAAAmyoCEkPzKqAQVAigCQzMsF0AzXbItDlYEGBJkYXNoX2Jhc2VsaW5lXzFfdjERAHX-B2XmnQEA&_nc_gid=cw6-_j-JhTMw7N2bbykfug&_nc_zt=28&oh=00_Afg8MfrPemi42J4kPLjJ3Jpe7mPzrPnSC1DVvRBU9yQy7g&oe=69213410", - "is_pinned": false - }, - { - "caption": "gliding > walking\n\n#InTheMoment\n\nVideo by @jamalsterrett", - "comments": 8159, - "datetime": "2025-11-18T17:05:56.000Z", - "id": "3768712011689532735", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/582427742_18681652075001321_2703457717514777768_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=52N6A1r_1dkQ7kNvwFBj8R7&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfiU7vFldfFzsRId6VYvtPS3ONiibGG8h7qH8KNDQHEqIg&oe=69251B1F&_nc_sid=8b3546", - "likes": 690701, - "content_type": "Video", - "url": "https://www.instagram.com/p/DRNJ5ttgJ0_", - "video_url": 
"https://scontent-fra3-2.cdninstagram.com/o1/v/t2/f2/m86/AQOa6KfkDlyBaPlGGwha7TzpmnzwLn9HAxE1P3B0ONs62ps2Fa_g65gKg9MDTe8QL0kv5snagf75btalD48NWFpGuEYWvG-Kw0FDiGg.mp4?_nc_cat=1&_nc_sid=5e9851&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_ohc=-k-i82foR2EQ7kNvwE9t5pI&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5JTlNUQUdSQU0uQ0xJUFMuQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSIsInhwdl9hc3NldF9pZCI6ODYzMzQxMzQyODI0Nzk1LCJhc3NldF9hZ2VfZGF5cyI6MSwidmlfdXNlY2FzZV9pZCI6MTAwOTksImR1cmF0aW9uX3MiOjE1LCJ1cmxnZW5fc291cmNlIjoid3d3In0%3D&ccb=17-1&vs=7242a09d606b124f&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC83MjRCQUJCOUMwNDM4NkMzRjhBMzUyOUI4MDIzNDRBMF92aWRlb19kYXNoaW5pdC5tcDQVAALIARIAFQIYOnBhc3N0aHJvdWdoX2V2ZXJzdG9yZS9HQ0FaeENLSE8yUkVHajBFQUNuc20xeWhMeEJfYnN0VEFRQUYVAgLIARIAKAAYABsCiAd1c2Vfb2lsATEScHJvZ3Jlc3NpdmVfcmVjaXBlATEVAAAmtuX4oIrNiAMVAigCQzMsF0AvIcrAgxJvGBJkYXNoX2Jhc2VsaW5lXzFfdjERAHX-B2XmnQEA&_nc_gid=cw6-_j-JhTMw7N2bbykfug&_nc_zt=28&oh=00_AfgPBs1wEI7XT7arXXKYJV5FXv9zGmRh4xQv21XfXIUxWQ&oe=692128BA", - "is_pinned": false - }, - { - "caption": "Fit recap with @mmiriku (Miri) and Ku \ud83d\udd8d\ufe0f\n\nPainting artist and graphic designer Miri created a cartoon character that\u2019s a nod to herself. With short hair and an expressionless face, Ku has become a canvas for showcasing Miri\u2019s weekly outfits. \n\n\u201cFor me, being creative means being free. I\u2019ve always loved fashion and the joy of dressing differently every day. 
I see outfits as another way to express my art, so this series became a visual diary of that connection.\u201d\n \nVideo by @mmiriku", - "comments": 4324, - "datetime": "2025-11-17T20:12:51.000Z", - "id": "3768080896697163511", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/582240227_18681527452001321_5089760910649723876_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=lf3mHDJM1FIQ7kNvwFwiBJo&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfiIw40HbVqPA_kHrh8AcTqJskj2DLI8UEcehMQuUPP4pA&oe=69250BA9&_nc_sid=8b3546", - "likes": 255394, - "content_type": "Video", - "url": "https://www.instagram.com/p/DRK6ZyEkd73", - "video_url": "https://scontent-fra3-2.cdninstagram.com/o1/v/t2/f2/m86/AQMx3Jh8WTOH4HE_MIidqORnBTsMQMX-qFGJEvzrw4JkrIhyBc8yjHrTq7KvWR0hcbR9u7mKq4NNk1FRVBL8UssDb6xRaDiP0R0cZsk.mp4?_nc_cat=1&_nc_sid=5e9851&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_ohc=GfZQxq-q3U0Q7kNvwGNPFVe&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5JTlNUQUdSQU0uQ0xJUFMuQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSIsInhwdl9hc3NldF9pZCI6MTEyMzI4MDQ4MzEyOTA1OCwiYXNzZXRfYWdlX2RheXMiOjIsInZpX3VzZWNhc2VfaWQiOjEwMDk5LCJkdXJhdGlvbl9zIjoyNiwidXJsZ2VuX3NvdXJjZSI6Ind3dyJ9&ccb=17-1&vs=840e4d6031c2976a&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9BOTQ2OTczQTRDOTA0QTUzNURFM0MxNDE3MUE1NjlCOV92aWRlb19kYXNoaW5pdC5tcDQVAALIARIAFQIYOnBhc3N0aHJvdWdoX2V2ZXJzdG9yZS9HSlFBd2lLdS03cjAyRUFIQU1LR21qX2l1ZzQ5YnN0VEFRQUYVAgLIARIAKAAYABsCiAd1c2Vfb2lsATEScHJvZ3Jlc3NpdmVfcmVjaXBlATEVAAAmxNvw4sPn_gMVAigCQzMsF0A6XbItDlYEGBJkYXNoX2Jhc2VsaW5lXzFfdjERAHX-B2XmnQEA&_nc_gid=cw6-_j-JhTMw7N2bbykfug&_nc_zt=28&oh=00_Afjn3K-1uGiIWBKdIIa7tbh9kERD1orMq5xugIaJyz5rAQ&oe=69212918", - "is_pinned": false - }, - { - "caption": "Musician @silvanaestradab (Silvana Estrada) finds her roots in family and the timeless sound of her instrument, the cuatro.\u2063\n\u2063\n\u201cWe have to 
embrace our roots and celebrate and understand that we are in the world because we have so much to give.\u201d \u2063\n\u2063\nHere\u2019s #10Things with Silvana ahead of the @latingrammys (Latin Grammys Awards), where \u201cComo un P\u00e1jaro\u201c was nominated for Best Singer-Songwriter song.\u2063\n\u2063\n1. A moment of silence amid the chaos \ud83e\uddd8\u200d\u2640\ufe0f\u2063\n2. Can we take a second for the fit? \ud83d\udc4f\u2063\n3. When family treasures become good luck charms \ud83e\udd79\u2063\n4. Just a girl and her cuatro \ud83c\udfb6\u2063\n5. A symbol of rebirth \u2728\u2063\n6. Floral on floral \ud83c\udf38\u2063\n7. Music = nostalgia \ud83c\udf0a\u2063\n8. Mirror, mirror on the wall\u2026 \ud83e\udd33\u2063\n9. Celebrating her culture \u2764\ufe0f\u2063\n10. In her element \u2b50", - "comments": 4316, - "datetime": "2025-11-17T17:00:50.000Z", - "id": "3767985117591555557", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/582063811_18681506197001321_6669266777538152909_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=nAoz_5C1IZIQ7kNvwF4o3on&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfiobQSvgGBWe6129K8fA-_u0z2knvdH9PBxjbA0OHCsLw&oe=6925132F&_nc_sid=8b3546", - "likes": 488444, - "content_type": "Carousel", - "url": "https://www.instagram.com/p/DRKkoA1AM3l", - "video_url": null, - "is_pinned": false - }, - { - "caption": "@vaibhav_sooryavanshi09 (Vaibhav Sooryavanshi) is a cricket legend \u2014 and he\u2019s only 14 years old. \n\nThe all-rounder is the youngest-ever player in the Indian Premier League and is a member of the @rajasthanroyals (Rajasthan Royals). His love for the game started with his dad, who also played cricket and gave Vaibhav his first kit bag at age 5. 
\n\nSpend a day with Vaibhav at practice, where he shows off his batting and bowling skills and reveals what\u2019s inside his current kit bags. \n\nVaibhav\u2019s advice to other young athletes? \u201cWhatever sport you like, don\u2019t quit playing. If you keep up your hard work, you will get results with time. And you will see your personal improvement in games, too.\u201d", - "comments": 5958, - "datetime": "2025-11-16T04:51:37.000Z", - "id": "3766893314734600553", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/581225632_18681267859001321_7235732305406302514_n.jpg?stp=dst-jpg_e35_p1080x1080_sh0.08_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=LeZzC_sZZZgQ7kNvwFFBNma&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfjbNpoi6JyQCWs6o8knfjASsA0YleVHqGPpnSee1poSTw&oe=69252463&_nc_sid=8b3546", - "likes": 1071751, - "content_type": "Carousel", - "url": "https://www.instagram.com/p/DRGsYMLjLFp", - "video_url": null, - "is_pinned": false - }, - { - "caption": "pens + desk = insane freestyle \ud83e\udd2f\u2063\n \u2063\n#InTheMoment\u2063\n \u2063\nVideo by @lenstrumental", - "comments": 26092, - "datetime": "2025-11-14T17:09:17.000Z", - "id": "3765814711745052414", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/581257672_1531511241429913_2185789193334358353_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=h0-mzsVmVLIQ7kNvwHkaQEY&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfgRkDglKQ_N5349iRoEvtXNoxvxk6ClqvGleCBE5r_i-Q&oe=69252891&_nc_sid=8b3546", - "likes": 1725560, - "content_type": "Video", - "url": "https://www.instagram.com/p/DRC3Ic3gP7-", - "video_url": 
"https://scontent-fra3-2.cdninstagram.com/o1/v/t2/f2/m86/AQPWYPpLgOef3yX6pCJIRSEdBSafXU4kA4YnaJEUHkNjsCzODjdG7OFmA24sCKwstz81gvkLxEIImtfDt6GGrL5JNLMMhDzlArUrzrs.mp4?_nc_cat=1&_nc_sid=5e9851&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_ohc=fBs1JsupTZEQ7kNvwGBG8Ap&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5JTlNUQUdSQU0uQ0xJUFMuQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSIsInhwdl9hc3NldF9pZCI6NDQyMzE3NTU2NDU4MzE2NywiYXNzZXRfYWdlX2RheXMiOjUsInZpX3VzZWNhc2VfaWQiOjEwMDk5LCJkdXJhdGlvbl9zIjo1NywidXJsZ2VuX3NvdXJjZSI6Ind3dyJ9&ccb=17-1&_nc_gid=cw6-_j-JhTMw7N2bbykfug&_nc_zt=28&vs=2428629e2ee008d6&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9DQjREMjc5Q0Q3NDA1OUE2QTU0MzM0RUM2NzgyQURCM192aWRlb19kYXNoaW5pdC5tcDQVAALIARIAFQIYOnBhc3N0aHJvdWdoX2V2ZXJzdG9yZS9HSmZPc0NMaEFSZTR5UlVIQUMxcDl3cEJwV2h3YnN0VEFRQUYVAgLIARIAKAAYABsCiAd1c2Vfb2lsATEScHJvZ3Jlc3NpdmVfcmVjaXBlATEVAAAm_oPzhNq22w8VAigCQzMsF0BM2ZmZmZmaGBJkYXNoX2Jhc2VsaW5lXzFfdjERAHX-B2XmnQEA&oh=00_AfgajaomMW0pkd9sc3eDw7DLe3rIQBoKOBRHTc5XVrO3tw&oe=69211A9A", - "is_pinned": false - }, - { - "caption": "Her name is Pink and she\u2019s really glad to meet you \ud83c\udfb6\ud83d\udc8b\u2063\n\u2063\nHere\u2019s #10Things from singer @pinkpantheress (PinkPantheress) as she gives us a behind-the-scenes look at her tour in New York, from a fan meet-and-greet to a sold-out show in Brooklyn. \u2063\n\u2063\n1. PinkPantheress is serving looks \ud83d\udd25\u2063\n2. Hair \u2705 Makeup \u2705 Vibes \u2705\u2063\n3. Fan meet-and-greet video inception \ud83c\udfa5\u2063\n4. \u201cPicture in My Mind\u201d \ud83e\udd1d Poster painting\u2063\n5. Costumes for days \u2764\ufe0f\u2063\n6. Working with the same makeup artist >>>\u2063\n7. Did somebody say set list?? \ud83d\udc40\u2063\n8. \ud83c\udfb6 Hey, ooh, is this illegal? \ud83c\udfb6\u2063\n9. Boxes on boxes of doughnuts \ud83d\ude0b\u2063\n10. SOLD OUT!!! 
\ud83d\udde3\ufe0f", - "comments": 7969, - "datetime": "2025-11-13T17:06:48.000Z", - "id": "3765089019533235772", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/562944142_18680886919001321_3400881731806163989_n.jpg?stp=dst-jpg_e35_p1080x1080_sh0.08_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=VPDlif0yjK0Q7kNvwH9JNRk&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfhZCRsf9PjJK5za4SJJs-hPKKZqQk8-2TBytdbtV2c6zg&oe=692532AE&_nc_sid=8b3546", - "likes": 622264, - "content_type": "Carousel", - "url": "https://www.instagram.com/p/DRASIPVAJY8", - "video_url": null, - "is_pinned": false - }, - { - "caption": "a wheel is a wheel \ud83e\udd37\n\n#InTheMoment\n\nVideo by @shinverus \nMusic by @teddysphotos", - "comments": 8264, - "datetime": "2025-11-12T20:10:36.000Z", - "id": "3764455947008836411", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/581189459_18680705881001321_5587454374300182126_n.jpg?stp=dst-jpg_e15_fr_p1080x1080_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=0NZpT5FhfAEQ7kNvwFUzrIj&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfgRBeIVGHnLFr6njpX0lP8DAa4FLnjavnbDIwgK32z6hA&oe=69252EF4&_nc_sid=8b3546", - "likes": 704601, - "content_type": "Video", - "url": "https://www.instagram.com/p/DQ-CL0mEYM7", - "video_url": 
"https://scontent-fra3-2.cdninstagram.com/o1/v/t2/f2/m86/AQP8IVMfGMNpzje_guHjee0ajnV5PjlXsD1fa0aM1m_1FM-_hUR4h_j36jFiHcqur6JBnSTBy-1S3jMr-SD8NFWHjE07mxh3rlRk4uQ.mp4?_nc_cat=1&_nc_sid=5e9851&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_ohc=7jn8srIfdfsQ7kNvwHBoaXg&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5JTlNUQUdSQU0uQ0xJUFMuQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSIsInhwdl9hc3NldF9pZCI6MzExMDQxNjMzMjQ2MDY0OSwiYXNzZXRfYWdlX2RheXMiOjcsInZpX3VzZWNhc2VfaWQiOjEwMDk5LCJkdXJhdGlvbl9zIjoxMywidXJsZ2VuX3NvdXJjZSI6Ind3dyJ9&ccb=17-1&_nc_gid=cw6-_j-JhTMw7N2bbykfug&_nc_zt=28&vs=95768cd10ffa91a5&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9CRTRFNEM4M0Q1Rjc3QjQyQ0YzODJEQTM5QUJCRkJCNV92aWRlb19kYXNoaW5pdC5tcDQVAALIARIAFQIYOnBhc3N0aHJvdWdoX2V2ZXJzdG9yZS9HTFJUcFNKWEU1aDl1Vm9FQUdjVFZtdnNaY0o3YnN0VEFRQUYVAgLIARIAKAAYABsCiAd1c2Vfb2lsATEScHJvZ3Jlc3NpdmVfcmVjaXBlATEVAAAm0snMyYe6hgsVAigCQzMsF0AqAAAAAAAAGBJkYXNoX2Jhc2VsaW5lXzFfdjERAHX-B2XmnQEA&oh=00_AfhBWuJdi1q_McjiiYd34e6l_VpFBviq2S4NPORneBEG6Q&oe=69210D3C", - "is_pinned": false - }, - { - "caption": "@charles_leclerc (Charles Leclerc) and his pup Leo are racing onto your feed \ud83c\udfce\ufe0f\u2063\n\u2063\nThe Formula 1 driver is back home in Monaco, a place where \u201ctime kind of slows down\u201d and brings back his favorite childhood memories, like hearing the engine noises of the Grand Prix while he was in school.\u2063\n\u2063\nLeo is another spot of joy for Charles. 
\u201cWhether it\u2019s a good day or a bad day, Leo is always happy and that makes a difference for sure.\u201d \ud83d\udc36\u2063\n\u2063\nPhotos and videos by @antoine", - "comments": 9444, - "datetime": "2025-11-12T17:02:25.000Z", - "id": "3764362036281132587", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/571159454_18680672356001321_6067283357652793275_n.jpg?stp=dst-jpg_e35_p1080x1080_sh0.08_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=EG0kuyUTXaQQ7kNvwEph4ya&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_Afg7CRaf9YC4lNyMWD_coNqJy_jArf90L8IWn4xKBjNXUw&oe=69251403&_nc_sid=8b3546", - "likes": 3557269, - "content_type": "Carousel", - "url": "https://www.instagram.com/p/DQ9s1PagMYr", - "video_url": null, - "is_pinned": false - }, - { - "caption": "if you\u2019re seeing this post, it\u2019s your sign to take a moment of zen \ud83e\uddd8\n\nthis waterfall in Brazil is called Cachoeira da Fuma\u00e7a, or \u201cSmoke Falls\u201d \ud83d\ude2e\ud83d\udca8\n\n#InTheMoment\n\nVideo by @marinavieirasou \nMusic by Johann Debussy", - "comments": 20105, - "datetime": "2025-11-11T21:09:29.000Z", - "id": "3763760604772428066", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/580975272_863975589424209_5954144657975698386_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=K55Nyz9o4AYQ7kNvwGtUDVG&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_Afi0xQxhYSth2-JlbsUuxFR6yDS9mVKv-wxYRmV7abp98Q&oe=69250A64&_nc_sid=8b3546", - "likes": 3717926, - "content_type": "Video", - "url": "https://www.instagram.com/p/DQ7kFQrEeki", - "video_url": 
"https://scontent-fra3-2.cdninstagram.com/o1/v/t2/f2/m86/AQPEFeiCW6XBves4wKJDUVPj7tkMIkQfclSs49Fh0UUQsrjDtPJj-Ywl0Wk0_ZtuUUsAmu8g6b7bup0uTb__F99GssFlxWQujqqMR9Y.mp4?_nc_cat=1&_nc_sid=5e9851&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_ohc=O7I-tMGMR0AQ7kNvwHPWWuX&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5JTlNUQUdSQU0uQ0xJUFMuQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSIsInhwdl9hc3NldF9pZCI6MTQ2MzEzMjc1ODEwMTM2NCwiYXNzZXRfYWdlX2RheXMiOjgsInZpX3VzZWNhc2VfaWQiOjEwMDk5LCJkdXJhdGlvbl9zIjo0MCwidXJsZ2VuX3NvdXJjZSI6Ind3dyJ9&ccb=17-1&vs=cd63dcc06f8fa02b&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9CNzQ5QjNFRDA2NzM4MTRDMUVFRDdGNkMyRUUxQTQ4OF92aWRlb19kYXNoaW5pdC5tcDQVAALIARIAFQIYOnBhc3N0aHJvdWdoX2V2ZXJzdG9yZS9HTm0yaFNKQjFkUnpIdWxaQUtMZmRPR3ZyUll2YnN0VEFRQUYVAgLIARIAKAAYABsCiAd1c2Vfb2lsATEScHJvZ3Jlc3NpdmVfcmVjaXBlATEVAAAm6LXyxMStmQUVAigCQzMsF0BECHKwIMScGBJkYXNoX2Jhc2VsaW5lXzFfdjERAHX-B2XmnQEA&_nc_gid=cw6-_j-JhTMw7N2bbykfug&_nc_zt=28&oh=00_AfiR5q0MZWJvoUBruxd5zgRoy-zvcyWXmsDx6iWUCg2Oyw&oe=69213AC9", - "is_pinned": false - }, - { - "caption": "Flipping through one of @artbythuraya\u2019s (Thuraya) sketchbooks like\u2026 \u270f\ufe0f\ud83d\udcda \n\nThe artist and graphic designer has been sketching and drawing for as long as she can remember. \u201cI love finding interesting color palettes and I\u2019m always drawn to colorful drawings and designs,\u201d says Thuraya.\n\nHer cure for artist\u2019s block? 
\u201cI like to paint some pages with neon pink or orange first so it feels less intimidating to draw or paint on them.\u201d \ud83c\udfa8\n \nVideo by @artbythuraya \nMusic by @8salamanda8", - "comments": 4132, - "datetime": "2025-11-11T17:08:16.000Z", - "id": "3763639257256696654", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/574669560_18680273659001321_1858701553672700147_n.jpg?stp=dst-jpg_e15_fr_p1080x1080_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=0LVdCDuRBp4Q7kNvwGkIu4w&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfjWpj0PsyTFXb6J98IdULJjluXkJ8gaFz_2YZhI_vU6Mw&oe=69253403&_nc_sid=8b3546", - "likes": 305332, - "content_type": "Video", - "url": "https://www.instagram.com/p/DQ7Ifa_gBtO", - "video_url": "https://scontent-fra3-2.cdninstagram.com/o1/v/t2/f2/m86/AQPJ5m9jYNnVN2_xKT8iKe1InFL-S2TQF5gqn9H9wncP2xnTwvs3Cg41QhXRm7jFOafn0W6A5QzvDN75IYlmXoRpT15P7FWRdfC5JV4.mp4?_nc_cat=111&_nc_sid=5e9851&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_ohc=Jnm96UiO86cQ7kNvwFHkKJG&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5JTlNUQUdSQU0uQ0xJUFMuQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSIsInhwdl9hc3NldF9pZCI6MTM4NzcyNzY2NjIwMzYzNCwiYXNzZXRfYWdlX2RheXMiOjgsInZpX3VzZWNhc2VfaWQiOjEwMDk5LCJkdXJhdGlvbl9zIjo2LCJ1cmxnZW5fc291cmNlIjoid3d3In0%3D&ccb=17-1&_nc_gid=cw6-_j-JhTMw7N2bbykfug&_nc_zt=28&vs=434223c562bfcfd4&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9GQjRCREY0QjcyRkRCNzBCMzkwMDU5N0Q2NjEzQkZBRV92aWRlb19kYXNoaW5pdC5tcDQVAALIARIAFQIYOnBhc3N0aHJvdWdoX2V2ZXJzdG9yZS9HTmpfa1NMelJWLWsyeVlFQUFtRDhHd1FLejVvYnN0VEFRQUYVAgLIARIAKAAYABsCiAd1c2Vfb2lsATEScHJvZ3Jlc3NpdmVfcmVjaXBlATEVAAAm5K-26bCI9wQVAigCQzMsF0AYqfvnbItEGBJkYXNoX2Jhc2VsaW5lXzFfdjERAHX-B2XmnQEA&oh=00_AfgpVH9AzIPOseNRW-ZSvc0hyEs2zbaNZD9YFS0piiApug&oe=69213EB4", - "is_pinned": false - }, - { - "caption": "@ariana_greenblatt\u2019s (Ariana Greenblatt) camera roll is pure magic 
\ud83e\ude84\u2728\u2063\n \u2063\nIn today\u2019s episode of #WhatsInMyCameraRoll, the actress shows off photos from:\u2063\n \u2063\n\n\ud83e\uddc0 a three-hour hunt for mac and cheese with @dominic.sessa (Dominic Sessa)\u2063\n\ud83e\udee3 stunt work gone wrong\u2063\n\ud83c\udfa5 never-before-seen BTS of her new movie @nysmmovie (\u201cNow You See Me: Now You Don\u2019t\u201d)", - "comments": 4969, - "datetime": "2025-11-10T20:03:10.000Z", - "id": "3763002910675382648", - "image_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-15/580702200_18679965823001321_2764781517024588673_n.jpg?stp=dst-jpg_e35_p1080x1080_sh0.08_tt6&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=B_myOTv3LEcQ7kNvwFk_Mw_&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_Afh4ZryUf08EQoG5a4BZYX8MsOmNV1po_eGIcQ487-JMcA&oe=69251893&_nc_sid=8b3546", - "likes": 321015, - "content_type": "Video", - "url": "https://www.instagram.com/p/DQ43zXDkTF4", - "video_url": 
"https://scontent-fra5-2.cdninstagram.com/o1/v/t2/f2/m86/AQPF9xQpIA1Lx413WZGH6TTatp3DDVZe4tzaKn4Ijcw_ZttODA7zLD8ULhNlA-vHSw6q4WTsBzqcfsUz4auU0iSr8DUT3SPg3fvC5n8.mp4?_nc_cat=109&_nc_sid=5e9851&_nc_ht=scontent-fra5-2.cdninstagram.com&_nc_ohc=jEJV1ukrYqMQ7kNvwE6dMre&efg=eyJ2ZW5jb2RlX3RhZyI6Inhwdl9wcm9ncmVzc2l2ZS5JTlNUQUdSQU0uQ0xJUFMuQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSIsInhwdl9hc3NldF9pZCI6ODc2NzMyMDE4MzYxMDM5LCJhc3NldF9hZ2VfZGF5cyI6OSwidmlfdXNlY2FzZV9pZCI6MTAwOTksImR1cmF0aW9uX3MiOjE3MCwidXJsZ2VuX3NvdXJjZSI6Ind3dyJ9&ccb=17-1&vs=17f0d6dfa828a48f&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC82NjRBRTdGOUE0MEJFNTIyQTdGMkYyQzJBNkI1N0NCNl92aWRlb19kYXNoaW5pdC5tcDQVAALIARIAFQIYOnBhc3N0aHJvdWdoX2V2ZXJzdG9yZS9HSDVha2lJXy1RRjh3endIQUlFOEh1VHFlbUpSYnN0VEFRQUYVAgLIARIAKAAYABsCiAd1c2Vfb2lsATEScHJvZ3Jlc3NpdmVfcmVjaXBlATEVAAAmnoukyMLYjgMVAigCQzMsF0BlQQ5WBBiTGBJkYXNoX2Jhc2VsaW5lXzFfdjERAHX-B2XmnQEA&_nc_gid=cw6-_j-JhTMw7N2bbykfug&_nc_zt=28&oh=00_AfgKOkWv2hBTWKD8iGRU7nTVYNimoKAKA1iM-Hd_sp8fFw&oe=69212F1E", - "is_pinned": false - } - ], - "profile_image_link": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-19/550891366_18667771684001321_1383210656577177067_n.jpg?stp=dst-jpg_s320x320_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby4xMDgwLmMyIn0&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=yJDuf_37I78Q7kNvwFwPPhF&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfiiZ25Szwb6Ps1PZVYRkQhp_UuzD1XQ5IB2relEmPEM2w&oe=69251AF1&_nc_sid=8b3546", - "profile_url": "https://instagram.com/instagram", - "profile_name": "Instagram", - "highlights_count": 15, - "full_name": "Instagram", - "is_private": false, - "url": "https://www.instagram.com/instagram", - "is_joined_recently": false, - "has_channel": false, - "partner_id": "25025320", - "business_address": null, - "related_accounts": [ - { - "id": "47913961291", - "profile_name": "\uc870\uc720\ub9ac JO YURI", - "is_private": 
false, - "is_verified": true, - "profile_pic_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-19/448149897_318348131333718_5639948001191412494_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby40OTcuYzIifQ&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=UrsCtrnb1W4Q7kNvwGSAzZ7&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfiBxEJ-HC3K_7Ec1qYH2P7vDhpeGwcFdBFUTdgRx6_f4w&oe=692517A5&_nc_sid=8b3546", - "user_name": "zo__glasss" - }, - { - "id": "52057517181", - "profile_name": "\u8a2d\u5b9a\u305b\u3076\u3093", - "is_private": false, - "is_verified": false, - "profile_pic_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-19/329419233_145796804994270_5889321886093160950_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby4xMDgwLmMyIn0&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=wnUOQU9uh2UQ7kNvwEHIFeZ&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_Afh4oZ06-S8RWGQZlPWSMs41jBbXp7G3utpz8L72ApZXYw&oe=69251676&_nc_sid=8b3546", - "user_name": "settei.seven" - }, - { - "id": "61519339885", - "profile_name": "ILLIT \uc544\uc77c\ub9bf", - "is_private": false, - "is_verified": true, - "profile_pic_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-19/571115836_17951810346051886_1465137572491758307_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby40OTkuYzIifQ&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=mLZMFzfMwYYQ7kNvwEgmfTe&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_Afje0N18lXuD49fwq0rvEs-JGaAvMt0ri6CLrNm7zcuPYw&oe=692518A9&_nc_sid=8b3546", - "user_name": "illit_official" - }, - { - "id": "61944716934", - "profile_name": "TWS 
(\ud22c\uc5b4\uc2a4)", - "is_private": false, - "is_verified": true, - "profile_pic_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-19/560548764_17943106626068935_7992087485001898401_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby43NTAuYzIifQ&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=yNI2quk4ALwQ7kNvwFSuXyM&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_Afj3-f6IUVUlNYKKuHB841POvSnMR8vUbYj6S2LWfztcnQ&oe=69250311&_nc_sid=8b3546", - "user_name": "tws_pledis" - }, - { - "id": "11927071408", - "profile_name": "\u110b\u1175\u11b7\u1109\u1175\u110b\u116a\u11ab", - "is_private": false, - "is_verified": true, - "profile_pic_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-19/470924210_631456425886325_6886504717911321733_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby4xMDUxLmMyIn0&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=KKfPRFDBSJgQ7kNvwGE_Fa5&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_Afjmtxy_Cq7lhOa-YMVAY-37jRLA47gadQmQcbi7UI1C_A&oe=692531D0&_nc_sid=8b3546", - "user_name": "yim_siwang" - }, - { - "id": "67066633135", - "profile_name": "Atrass\u3010\u30a2\u30c8\u30e9\u30b9\u3011", - "is_private": false, - "is_verified": false, - "profile_pic_url": "https://scontent-fra3-2.cdninstagram.com/v/t51.2885-19/447197239_473707615114615_6794268554276293899_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby4xMDgwLmMyIn0&_nc_ht=scontent-fra3-2.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QHRNKnpTyp3nOTa90wqCPSVZpi-KuApYBSwHsZkqNswtqlwIfFChTfLlBJQSDbpzdg&_nc_ohc=XsKyIhMs29cQ7kNvwH8zBNX&_nc_gid=cw6-_j-JhTMw7N2bbykfug&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AfjJ7BiSgOcx4eZlTnCWgF5VA5_JlZrbyeeBTUtHunFJOA&oe=692530BA&_nc_sid=8b3546", - "user_name": 
"atrass_wingashan" - } - ], - "email_address": null, - "timestamp": "2025-11-20T16:54:51.664Z", - "input": { - "url": "https://www.instagram.com/instagram" - } -} \ No newline at end of file diff --git a/tests/samples/linkedin/profile.json b/tests/samples/linkedin/profile.json deleted file mode 100644 index ed81411..0000000 --- a/tests/samples/linkedin/profile.json +++ /dev/null @@ -1,407 +0,0 @@ -{ - "id": "williamhgates", - "name": "Bill Gates", - "city": "Seattle, Washington, United States", - "country_code": "US", - "position": "Chair, Gates Foundation and Founder, Breakthrough Energy", - "about": "Chair of the Gates Foundation. Founder of Breakthrough Energy. Co-founder of Microsoft. Voracious reader. Avid traveler. Active blogger.", - "posts": [ - { - "title": "Saving lives, cutting emissions, and staying resilient in a warming world", - "attribution": "I recently published a long essay about climate change on the Gates Notes. This is the first of four newsletters I\u2019ll\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/saving-lives-cutting-emissions-staying-resilient-warming-bill-gates-jstyc", - "created_at": "2025-10-29T00:00:00.000Z", - "interaction": "5,43 - 989 Comments", - "id": "7389128335357947904" - }, - { - "title": "We\u2019re closer than ever to eradicating polio", - "attribution": "..", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/were-closer-than-ever-eradicating-polio-bill-gates-wyhac", - "created_at": "2025-10-18T00:00:00.000Z", - "interaction": "5,81 - 719 Comments", - "id": "7385166929856172032" - }, - { - "title": "Demystifying the science 
behind fission and fusion", - "attribution": "I\u2019m lucky to learn firsthand about some of the world\u2019s most cutting-edge technologies. I\u2019ve seen artificial\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/demystifying-science-behind-fission-fusion-bill-gates-ylhic", - "created_at": "2025-10-11T00:00:00.000Z", - "interaction": "5,39 - 727 Comments", - "id": "7382558042824855552" - }, - { - "title": "Utah\u2019s hottest new power source is 15,000 feet below the ground", - "attribution": "When my son, Rory, was younger, we used to love visiting power plants together. It was the perfect father-son activity\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/utahs-hottest-new-power-source-15000-feet-below-ground-bill-gates-otlwc", - "created_at": "2025-09-30T00:00:00.000Z", - "interaction": "5,99 - 661 Comments", - "id": "7378858122087616513" - }, - { - "title": "Why I\u2019m Still Optimistic About Global Health", - "attribution": "I recently wrote this essay for TIME Magazine about why I'm still optimistic about global health: One of humanity\u2019s\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/why-im-still-optimistic-global-health-bill-gates-ji9xc", - "created_at": "2025-09-23T00:00:00.000Z", - "interaction": "4,33 - 765 Comments", - "id": "7376347643272343554" - }, - { - "title": "This is how a parasite helped build the CDC and changed public health 
forever", - "attribution": "I spend a lot of time thinking and worrying about malaria. After all, it\u2019s one of the big focuses of my work at the\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/how-parasite-helped-build-cdc-changed-public-health-forever-gates-xvhlc", - "created_at": "2025-08-26T00:00:00.000Z", - "interaction": "4,10 - 613 Comments", - "id": "7366207310018375680" - }, - { - "title": "One of the most unique and supportive learning environments I have ever heard of", - "attribution": "When I was a kid, I couldn\u2019t sit still. My teachers used to get mad at me for squirming in my chair and chewing on my\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/one-most-unique-supportive-learning-environments-i-have-bill-gates-e3fcc", - "created_at": "2025-08-13T00:00:00.000Z", - "interaction": "5,59 - 901 Comments", - "id": "7361457006081134592" - }, - { - "title": "This heroic nurse climbs 1000-foot ladders to save lives", - "attribution": "How do you get to work? Some people roll out of bed and move 10 feet to their desk. 
Others walk to the office or take\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/heroic-nurse-climbs-1000-foot-ladders-save-lives-bill-gates-gh0ic", - "created_at": "2025-07-31T00:00:00.000Z", - "interaction": "5,85 - 823 Comments", - "id": "7356808124818735104" - }, - { - "title": "A gut-wrenching problem we can solve", - "attribution": "In 1997, I came across a New York Times column by Nick Kristof that stopped me in my tracks. The headline was \u201cFor\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/gut-wrenching-problem-we-can-solve-bill-gates-ahczc", - "created_at": "2025-07-27T00:00:00.000Z", - "interaction": "6,27 - 1,154 Comments", - "id": "7354909425704292352" - }, - { - "title": "A book about tuberculosis, and everything else", - "attribution": "What do Adirondack chairs, Stetson hats, the city of Pasadena, and World War I have in common? 
According to John Green,\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5612AQHn1kRpjWsY7A/article-cover_image-shrink_720_1280/B56Zot1D3DG0AM-/0/1761705477503?e=2147483647&v=beta&t=wnrHrl7BgWHpsXAXFEOUbcmFE0tnxibZ-Ze5ESzbKMs", - "link": "https://www.linkedin.com/pulse/book-tuberculosis-everything-else-bill-gates-5ibhc", - "created_at": "2025-07-24T00:00:00.000Z", - "interaction": "4,40 - 668 Comments", - "id": "7354250885624946688" - } - ], - "current_company": { - "name": "Gates Foundation", - "company_id": "gates-foundation", - "title": "Co-chair", - "location": null - }, - "experience": [ - { - "title": "Co-chair", - "description_html": null, - "start_date": "2000", - "end_date": "Present", - "company": "Gates Foundation", - "company_id": "gates-foundation", - "url": "https://www.linkedin.com/company/gates-foundation", - "company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQEgMqqFTd40Tg/company-logo_100_100/company-logo_100_100/0/1736784969376/bill__melinda_gates_foundation_logo?e=2147483647&v=beta&t=2JH2cMcZms60vPAMbvVZyMeYXosQ1Jjy5axDlyeQ1Ww" - }, - { - "title": "Founder", - "description_html": null, - "start_date": "2015", - "end_date": "Present", - "company": "Breakthrough Energy", - "company_id": "breakthrough-energy", - "url": "https://www.linkedin.com/company/breakthrough-energy", - "company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQFRMYiQN7-2kA/company-logo_100_100/B56ZoI4SGPI0AQ-/0/1761085563539/breakthrough_energy_logo?e=2147483647&v=beta&t=J6RbEvs17fl1uiEaXQm0hmXy4imx36mV_Hu80JcR1DE" - }, - { - "title": "Co-founder", - "description_html": null, - "start_date": "1975", - "end_date": "Present", - "company": "Microsoft", - "company_id": "microsoft", - "url": "https://www.linkedin.com/company/microsoft", - "company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQH32RJQCl3dDQ/company-logo_100_100/B56ZYQ0mrGGoAU-/0/1744038948046/microsoft_logo?e=2147483647&v=beta&t=rr_7_bFRKp6umQxIHErPOZHtR8dMPIYeTjlKFdotJBY" - } - 
], - "url": "https://tr.linkedin.com/in/williamhgates", - "people_also_viewed": [ - { - "profile_link": "https://www.linkedin.com/in/melindagates", - "name": "Melinda French Gates", - "about": null, - "location": "United States" - }, - { - "profile_link": "https://www.linkedin.com/in/tyleralterman", - "name": "Tyler Alterman", - "about": null, - "location": "Brooklyn, NY" - }, - { - "profile_link": "https://www.linkedin.com/in/toddjduckett", - "name": "Todd J. Duckett", - "about": null, - "location": "Lansing, MI" - }, - { - "profile_link": "https://is.linkedin.com/in/hallatomasdottir", - "name": "Halla Tomasdottir", - "about": null, - "location": "Iceland" - }, - { - "profile_link": "https://www.linkedin.com/in/matthew-swift-8ba7529", - "name": "Matthew Swift", - "about": null, - "location": "Palm Beach, FL" - }, - { - "profile_link": "https://www.linkedin.com/in/petefishman", - "name": "Peter Fishman", - "about": null, - "location": "San Francisco, CA" - }, - { - "profile_link": "https://www.linkedin.com/in/sherryb", - "name": "\u2726 Sherry Whitaker Budziak", - "about": null, - "location": "Deerfield, IL" - }, - { - "profile_link": "https://www.linkedin.com/in/tonyteravainen", - "name": "Tony Teravainen PMP CSSBB", - "about": null, - "location": "San Diego, CA" - }, - { - "profile_link": "https://www.linkedin.com/in/charlesmarohn", - "name": "Charles Marohn", - "about": null, - "location": "Brainerd, MN" - }, - { - "profile_link": "https://www.linkedin.com/in/schm1tt", - "name": "Patrick Schmitt", - "about": null, - "location": "New York, NY" - }, - { - "profile_link": "https://www.linkedin.com/in/melindalackey", - "name": "Melinda Lackey", - "about": null, - "location": "New York, NY" - }, - { - "profile_link": "https://www.linkedin.com/in/bill-cronin-5490492", - "name": "Bill Cronin", - "about": null, - "location": "Odessa, FL" - }, - { - "profile_link": "https://www.linkedin.com/in/ezohn", - "name": "Ethan Zohn", - "about": null, - "location": "Hillsborough 
County, NH" - }, - { - "profile_link": "https://www.linkedin.com/in/gary-taubes-942a6459", - "name": "Gary Taubes", - "about": null, - "location": "Oakland, CA" - }, - { - "profile_link": "https://www.linkedin.com/in/sharonhenifin", - "name": "Sharon Henifin, CLC, CN-BA", - "about": null, - "location": "Portland, Oregon Metropolitan Area" - }, - { - "profile_link": "https://www.linkedin.com/in/josephrrusso", - "name": "Joseph Russo", - "about": null, - "location": "West Palm Beach, FL" - }, - { - "profile_link": "https://www.linkedin.com/in/jasongrad", - "name": "Jason Grad", - "about": null, - "location": "New York, NY" - }, - { - "profile_link": "https://www.linkedin.com/in/mrdaikensjr", - "name": "Dwayne Aikens Jr.", - "about": null, - "location": "Oakland, CA" - }, - { - "profile_link": "https://www.linkedin.com/in/erikrees", - "name": "Erik Rees", - "about": null, - "location": "Rancho Santa Margarita, CA" - } - ], - "educations_details": "Harvard University", - "education": [ - { - "title": "Harvard University", - "url": "https://www.linkedin.com/school/harvard-university/?trk=public_profile_school_profile-section-card_image-click", - "start_year": "1973", - "end_year": "1975", - "description": null, - "description_html": null, - "institute_logo_url": "https://media.licdn.com/dms/image/v2/C4E0BAQF5t62bcL0e9g/company-logo_100_100/company-logo_100_100/0/1631318058235?e=2147483647&v=beta&t=Ye1klXowyo8TIcnkhTlmORgiA5ZywvooNihDMnx5urQ" - }, - { - "title": "Lakeside School", - "url": "https://www.linkedin.com/school/lakeside-school/?trk=public_profile_school_profile-section-card_image-click", - "description": null, - "description_html": null, - "institute_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQGFmOQmzpxg9A/company-logo_100_100/company-logo_100_100/0/1683732883164/lakeside_school_logo?e=2147483647&v=beta&t=EmadOLH7MckKZvCCrgmAOikCRtzVRtqqN4PJi35CNyo" - } - ], - "avatar": 
"https://media.licdn.com/dms/image/v2/D5603AQF-RYZP55jmXA/profile-displayphoto-shrink_200_200/B56ZRi8g.aGsAY-/0/1736826818802?e=2147483647&v=beta&t=bKWfN6UwwtiCqFWsG7rBELbd48qJOAMLdxhBzzkJV0k", - "followers": 39312887, - "connections": 8, - "current_company_company_id": "gates-foundation", - "current_company_name": "Gates Foundation", - "location": "Seattle", - "input_url": "https://www.linkedin.com/in/williamhgates", - "linkedin_id": "williamhgates", - "activity": [ - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_luxwall-a-breakthrough-energybacked-company-activity-7397039090300289024-i8M3", - "title": "LuxWall, a Breakthrough Energy\u2013backed company, is growing in Detroit\u2014and bringing new jobs along with it.", - "img": "https://static.licdn.com/aero-v1/sc/h/53n89ecoxpr1qrki1do3alazb", - "id": "7397039090300289024" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_five-years-ago-just-two-months-after-my-activity-7396302164459102208-Kbj8", - "title": "Five years ago, just two months after my dad died from Alzheimer's disease, I worked with a coalition of partners to create the Alzheimer's Disease\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5622AQHgEdBt8av3CQ/feedshare-shrink_800/B56ZqTxoD2JYAg-/0/1763415851860?e=2147483647&v=beta&t=zCTCb6zxupuvG6lfR8wLNsSR3EqB6U_q8wRVDtTI0uY", - "id": "7396302164459102208" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_fighting-climate-change-requires-actions-activity-7393808373814685696-r4Ed", - "title": "Fighting climate change requires actions on two fronts: cutting emissions and protecting vulnerable people. 
I will continue to invest billions in\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5622AQHHcm91usLudw/feedshare-shrink_2048_1536/B56ZpwViXrJQAw-/0/1762821285730?e=2147483647&v=beta&t=xBCPzIccwCP53aFG20U2hyamr2xJphdmDENxCqeTQoc", - "id": "7393808373814685696" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_my-commitment-to-fightingand-solvingclimate-activity-7393404126505689088-fiqd", - "title": "My commitment to fighting\u2014and solving\u2014climate change has not wavered. In addition to the billions I am investing in innovation that will help the\u2026", - "img": "https://static.licdn.com/aero-v1/sc/h/53n89ecoxpr1qrki1do3alazb", - "id": "7393404126505689088" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_sa-becomes-the-first-african-country-to-register-activity-7393120672111185920-SKf2", - "title": "South Africa\u2019s Lenacapavir rollout is a signal that progress is possible when innovation meets urgency.", - "img": "https://media.licdn.com/dms/image/sync/v2/D4D27AQFNkSDu_tpZ7g/articleshare-shrink_1280_800/B4DZokVReGJIAQ-/0/1761596917780?e=2147483647&v=beta&t=2XH0BTMGQJgud_VJq-Oyfz5VVFcOjzPQmlKWvkiI0GQ", - "id": "7393120672111185920" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_to-strengthen-human-welfare-globally-we-activity-7392748129042907137-9s6_", - "title": "To strengthen human welfare globally, we must help the most vulnerable communities adapt to a warming planet while continuing to invest in critical\u2026", - "img": "https://static.licdn.com/aero-v1/sc/h/53n89ecoxpr1qrki1do3alazb", - "id": "7392748129042907137" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_when-i-started-breakthrough-energy-the-world-activity-7392049630605099008-WCuo", - "title": "When I started Breakthrough Energy, the world needed 
affordable clean energy solutions that didn\u2019t exist yet. \u200b \u200b Affordable, reliable, clean energy\u2026", - "img": "https://media.licdn.com/dms/image/v2/D4D05AQGF8BR-A7TzTw/videocover-high/B4DZpXV5dsG8BU-/0/1762401954068?e=2147483647&v=beta&t=p-h5YEqqlB4cWDe0JicwMiFaNOi_iHMZSdG3L6PGjzo", - "id": "7392049630605099008" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_today-i-visited-the-alzheimers-therapeutic-activity-7391650830199775233-cR81", - "title": "Today I visited the Alzheimer's Therapeutic Research Institute (ATRI) at USC, led by Dr. Paul Aisen, to learn more about the current landscape of\u2026", - "img": "https://media.licdn.com/dms/image/v2/D5622AQEFYSq5diGoKg/feedshare-shrink_800/B56ZpRrQg3HQAk-/0/1762306886686?e=2147483647&v=beta&t=5dpkTj8DyD87d5jbw-ylidifhkJdkVtMwdfKVu0l3cQ", - "id": "7391650830199775233" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_were-on-the-brink-of-eradicating-polio-for-activity-7391179821646716928-EGiI", - "title": "We\u2019re on the brink of eradicating polio for good. It would be a deadly mistake to back down from the fight now.", - "img": "https://media.licdn.com/dms/image/v2/D5605AQFMop8kHgEtkQ/videocover-high/B56ZpK.wx4HYBU-/0/1762194575348?e=2147483647&v=beta&t=LQJGu7ZZLYlFrnGcfHpwAhnl0K_bIn94RQT4_hZh5Z0", - "id": "7391179821646716928" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_this-is-an-exciting-partnership-with-alzheimers-activity-7390903402932813824-mTWv", - "title": "This is an exciting partnership with Alzheimer's Research UK. 
Answering these questions could change the course of our fight against Alzheimer\u2019s.", - "img": "https://static.licdn.com/aero-v1/sc/h/53n89ecoxpr1qrki1do3alazb", - "id": "7390903402932813824" - }, - { - "interaction": "Liked by Bill Gates", - "link": "https://www.linkedin.com/posts/alzheimer%27s-research-uk_today-marks-a-pivotal-moment-in-the-global-activity-7387141173267816448-bf5I", - "title": "Today marks a pivotal moment in the global fight against dementia. Alzheimer\u2019s Research UK, alongside Gates Ventures are proud to launch a\u2026", - "img": "https://media.licdn.com/dms/image/v2/D4E10AQELRUrvrLBxFQ/ads-video-thumbnail_720_1280/B4EZoRlo13KsAc-/0/1761231670956?e=2147483647&v=beta&t=skARaYStlXOrE0cNE5CgdPYQx4cELDW8kdRu6XuacsI", - "id": "7387141173267816448" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_im-grateful-for-people-like-john-and-nancy-activity-7390459116651155457-2aYl", - "title": "I\u2019m grateful for people like John and Nancy from Rotary International\u2014leaders whose courage and commitment bring us closer to a polio-free world\u2026", - "img": "https://static.licdn.com/aero-v1/sc/h/53n89ecoxpr1qrki1do3alazb", - "id": "7390459116651155457" - }, - { - "interaction": "Liked by Bill Gates", - "link": "https://www.linkedin.com/posts/nancy-barbee-18a6308_i-sat-next-to-bill-gates-at-the-gates-foundation-activity-7388529939463180288-lJiu", - "title": "I sat next to Bill Gates at the Gates Foundation media event for World Polio Day 2025 Bill is the person who inspired me to start leading Rotarians\u2026", - "img": "https://media.licdn.com/dms/image/v2/D4E22AQFXY_wl5a3-Hg/feedshare-shrink_800/B4EZokJLI7GYAg-/0/1761542977553?e=2147483647&v=beta&t=T9wB3225_8CIbeVs6KZae4GRuM3jJHNAdZCXsHm76Hk", - "id": "7388529939463180288" - }, - { - "interaction": "Shared by Bill Gates", - "link": 
"https://www.linkedin.com/posts/williamhgates_a-new-approach-for-the-worlds-climate-strategy-activity-7390110248264466432-tub6", - "title": "Climate change is one of the most pressing challenges the world faces today. The good news is that we've made incredible progress in recent years\u2026", - "img": "https://media.licdn.com/dms/image/sync/v2/D4E27AQEwAcMGPj_kKA/articleshare-shrink_1280_800/B4EZopJuU0KoAQ-/0/1761627006826?e=2147483647&v=beta&t=dx6lfbhpJMJ2nd-K-gmIFrS_odoBhduCEfYVgUhGolY", - "id": "7390110248264466432" - }, - { - "interaction": "Shared by Bill Gates", - "link": "https://www.linkedin.com/posts/williamhgates_congratulations-on-this-well-deserved-award-activity-7388262728068452352-N6SR", - "title": "Congratulations on this well-deserved award. I\u2019m grateful for your leadership and commitment to ensuring everyone can live a healthy, prosperous life.", - "img": "https://static.licdn.com/aero-v1/sc/h/53n89ecoxpr1qrki1do3alazb", - "id": "7388262728068452352" - } - ], - "linkedin_num_id": "251749025", - "banner_image": "https://media.licdn.com/dms/image/v2/D5616AQEjhPbTCeblYg/profile-displaybackgroundimage-shrink_200_800/B56ZcytR5SGsAc-/0/1748902420393?e=2147483647&v=beta&t=a-tBeZkxzWTHWYY6MAjxt0oTEuxlW33EUkK3gm5_te4", - "honors_and_awards": null, - "similar_profiles": [], - "default_avatar": false, - "memorialized_account": false, - "bio_links": [ - { - "title": "Blog", - "link": "https://gatesnot.es/sourcecode-li" - } - ], - "first_name": "Bill", - "last_name": "Gates", - "timestamp": "2025-11-20T17:04:28.062Z", - "input": { - "url": "https://www.linkedin.com/in/williamhgates" - } -} \ No newline at end of file diff --git a/tests/samples/serp/google.json b/tests/samples/serp/google.json deleted file mode 100644 index a6727ca..0000000 --- a/tests/samples/serp/google.json +++ /dev/null @@ -1,23 +0,0 @@ -[ - { - "position": 1, - "title": "Pizza Hut | Delivery & Carryout - No One OutPizzas The Hut!", - "url": "https://www.pizzahut.com/", - 
"description": "Discover classic & new menu items, find deals and enjoy seamless ordering for delivery and carryout. No One OutPizzas the Hut\u00ae.", - "displayed_url": "https://www.pizzahut.com" - }, - { - "position": 2, - "title": "Pizza", - "url": "https://en.wikipedia.org/wiki/Pizza", - "description": "Pizza is an Italian dish typically consisting of a flat base of leavened wheat-based dough topped with tomato, cheese, and other ingredients, baked at a ...", - "displayed_url": "https://en.wikipedia.org \u203a wiki \u203a Pizza" - }, - { - "position": 3, - "title": "Domino's: Pizza Delivery & Carryout, Pasta, Wings & More", - "url": "https://www.dominos.com/", - "description": "PRICES HIGHER FOR SOME LOCATIONS. Treat yo self to our best, most premium medium Specialty Pizzas for just $9.99 each when you Mix & Match.", - "displayed_url": "https://www.dominos.com" - } -] \ No newline at end of file diff --git a/tests/samples/web_unlocker/country_targeting.html b/tests/samples/web_unlocker/country_targeting.html deleted file mode 100644 index c07a7cf..0000000 --- a/tests/samples/web_unlocker/country_targeting.html +++ /dev/null @@ -1,17 +0,0 @@ -{ - "headers": { - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", - "Accept-Encoding": "gzip, deflate, br, zstd", - "Accept-Language": "en-US,en;q=0.9", - "Host": "httpbin.org", - "Sec-Ch-Ua": "\"Chromium\";v=\"142\", \"Microsoft Edge\";v=\"142\", \"Not_A Brand\";v=\"99\"", - "Sec-Ch-Ua-Platform": "\"Windows\"", - "Sec-Fetch-Dest": "empty", - "Sec-Fetch-Mode": "cors", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?0", - "Upgrade-Insecure-Requests": "1", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36 Edg/142.0.0.0", - "X-Amzn-Trace-Id": "Root=1-691f5229-7d5c92055198bdba39341a7f" - } -} diff --git 
a/tests/samples/web_unlocker/multiple_urls_1.html b/tests/samples/web_unlocker/multiple_urls_1.html deleted file mode 100644 index d55209d..0000000 --- a/tests/samples/web_unlocker/multiple_urls_1.html +++ /dev/null @@ -1,14 +0,0 @@ - - - - - -

Herman Melville - Moby-Dick

- -
-

- Availing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and bowsmen to do some little job for them; altering, or repairing, or new shaping their various weapons and boat furniture. Often he would be surrounded by an eager circle, all waiting to be served; holding boat-spades, pike-heads, harpoons, and lances, and jealously watching his every sooty movement, as he toiled. Nevertheless, this old man's was a patient hammer wielded by a patient arm. No murmur, no impatience, no petulance did come from him. Silent, slow, and solemn; bowing over still further his chronically broken back, he toiled away, as if toil were life itself, and the heavy beating of his hammer the heavy beating of his heart. And so it was.—Most miserable! A peculiar walk in this old man, a certain slight but painful appearing yawing in his gait, had at an early period of the voyage excited the curiosity of the mariners. And to the importunity of their persisted questionings he had finally given in; and so it came to pass that every one now knew the shameful story of his wretched fate. Belated, and not innocently, one bitter winter's midnight, on the road running between two country towns, the blacksmith half-stupidly felt the deadly numbness stealing over him, and sought refuge in a leaning, dilapidated barn. The issue was, the loss of the extremities of both feet. Out of this revelation, part by part, at last came out the four acts of the gladness, and the one long, and as yet uncatastrophied fifth act of the grief of his life's drama. 
He was an old man, who, at the age of nearly sixty, had postponedly encountered that thing in sorrow's technicals called ruin. He had been an artisan of famed excellence, and with plenty to do; owned a house and garden; embraced a youthful, daughter-like, loving wife, and three blithe, ruddy children; every Sunday went to a cheerful-looking church, planted in a grove. But one night, under cover of darkness, and further concealed in a most cunning disguisement, a desperate burglar slid into his happy home, and robbed them all of everything. And darker yet to tell, the blacksmith himself did ignorantly conduct this burglar into his family's heart. It was the Bottle Conjuror! Upon the opening of that fatal cork, forth flew the fiend, and shrivelled up his home. Now, for prudent, most wise, and economic reasons, the blacksmith's shop was in the basement of his dwelling, but with a separate entrance to it; so that always had the young and loving healthy wife listened with no unhappy nervousness, but with vigorous pleasure, to the stout ringing of her young-armed old husband's hammer; whose reverberations, muffled by passing through the floors and walls, came up to her, not unsweetly, in her nursery; and so, to stout Labor's iron lullaby, the blacksmith's infants were rocked to slumber. Oh, woe on woe! Oh, Death, why canst thou not sometimes be timely? Hadst thou taken this old blacksmith to thyself ere his full ruin came upon him, then had the young widow had a delicious grief, and her orphans a truly venerable, legendary sire to dream of in their after years; and all of them a care-killing competency. -

-
- - \ No newline at end of file diff --git a/tests/samples/web_unlocker/multiple_urls_2.html b/tests/samples/web_unlocker/multiple_urls_2.html deleted file mode 100644 index 53d90db..0000000 --- a/tests/samples/web_unlocker/multiple_urls_2.html +++ /dev/null @@ -1,24 +0,0 @@ -{ - "args": {}, - "data": "", - "files": {}, - "form": {}, - "headers": { - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", - "Accept-Encoding": "gzip, deflate, br, zstd", - "Accept-Language": "en-US,en;q=0.9", - "Host": "httpbin.org", - "Sec-Ch-Ua": "\"Chromium\";v=\"142\", \"Google Chrome\";v=\"142\", \"Not_A Brand\";v=\"99\"", - "Sec-Ch-Ua-Mobile": "?0", - "Sec-Ch-Ua-Platform": "\"Windows\"", - "Sec-Fetch-Dest": "empty", - "Sec-Fetch-Mode": "cors", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?0", - "Upgrade-Insecure-Requests": "1", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36", - "X-Amzn-Trace-Id": "Root=1-691f5225-54aa4c085727d04a6e2abdd8" - }, - "origin": "r43fc13031b33c14da638eac9dd957057", - "url": "https://httpbin.org/delay/1" -} diff --git a/tests/samples/web_unlocker/multiple_urls_3.html b/tests/samples/web_unlocker/multiple_urls_3.html deleted file mode 100644 index 21e5735..0000000 --- a/tests/samples/web_unlocker/multiple_urls_3.html +++ /dev/null @@ -1 +0,0 @@ -Example Domain

Example Domain

This domain is for use in documentation examples without needing permission. Avoid use in operations.

Learn more

diff --git a/tests/samples/web_unlocker/single_url_json.json b/tests/samples/web_unlocker/single_url_json.json deleted file mode 100644 index 0c69a4d..0000000 --- a/tests/samples/web_unlocker/single_url_json.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "status_code": 200, - "headers": { - "access-control-allow-credentials": "true", - "access-control-allow-origin": "*", - "content-type": "application/json", - "date": "Thu, 20 Nov 2025 17:38:43 GMT", - "server": "gunicorn/19.9.0", - "connection": "close", - "transfer-encoding": "chunked" - }, - "body": "{\n \"slideshow\": {\n \"author\": \"Yours Truly\", \n \"date\": \"date of publication\", \n \"slides\": [\n {\n \"title\": \"Wake up to WonderWidgets!\", \n \"type\": \"all\"\n }, \n {\n \"items\": [\n \"Why WonderWidgets are great\", \n \"Who buys WonderWidgets\"\n ], \n \"title\": \"Overview\", \n \"type\": \"all\"\n }\n ], \n \"title\": \"Sample Slide Show\"\n }\n}\n" -} \ No newline at end of file diff --git a/tests/samples/web_unlocker/single_url_raw.html b/tests/samples/web_unlocker/single_url_raw.html deleted file mode 100644 index d55209d..0000000 --- a/tests/samples/web_unlocker/single_url_raw.html +++ /dev/null @@ -1,14 +0,0 @@ - - - - - -

Herman Melville - Moby-Dick

- -
-

- Availing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and bowsmen to do some little job for them; altering, or repairing, or new shaping their various weapons and boat furniture. Often he would be surrounded by an eager circle, all waiting to be served; holding boat-spades, pike-heads, harpoons, and lances, and jealously watching his every sooty movement, as he toiled. Nevertheless, this old man's was a patient hammer wielded by a patient arm. No murmur, no impatience, no petulance did come from him. Silent, slow, and solemn; bowing over still further his chronically broken back, he toiled away, as if toil were life itself, and the heavy beating of his hammer the heavy beating of his heart. And so it was.—Most miserable! A peculiar walk in this old man, a certain slight but painful appearing yawing in his gait, had at an early period of the voyage excited the curiosity of the mariners. And to the importunity of their persisted questionings he had finally given in; and so it came to pass that every one now knew the shameful story of his wretched fate. Belated, and not innocently, one bitter winter's midnight, on the road running between two country towns, the blacksmith half-stupidly felt the deadly numbness stealing over him, and sought refuge in a leaning, dilapidated barn. The issue was, the loss of the extremities of both feet. Out of this revelation, part by part, at last came out the four acts of the gladness, and the one long, and as yet uncatastrophied fifth act of the grief of his life's drama. 
He was an old man, who, at the age of nearly sixty, had postponedly encountered that thing in sorrow's technicals called ruin. He had been an artisan of famed excellence, and with plenty to do; owned a house and garden; embraced a youthful, daughter-like, loving wife, and three blithe, ruddy children; every Sunday went to a cheerful-looking church, planted in a grove. But one night, under cover of darkness, and further concealed in a most cunning disguisement, a desperate burglar slid into his happy home, and robbed them all of everything. And darker yet to tell, the blacksmith himself did ignorantly conduct this burglar into his family's heart. It was the Bottle Conjuror! Upon the opening of that fatal cork, forth flew the fiend, and shrivelled up his home. Now, for prudent, most wise, and economic reasons, the blacksmith's shop was in the basement of his dwelling, but with a separate entrance to it; so that always had the young and loving healthy wife listened with no unhappy nervousness, but with vigorous pleasure, to the stout ringing of her young-armed old husband's hammer; whose reverberations, muffled by passing through the floors and walls, came up to her, not unsweetly, in her nursery; and so, to stout Labor's iron lullaby, the blacksmith's infants were rocked to slumber. Oh, woe on woe! Oh, Death, why canst thou not sometimes be timely? Hadst thou taken this old blacksmith to thyself ere his full ruin came upon him, then had the young widow had a delicious grief, and her orphans a truly venerable, legendary sire to dream of in their after years; and all of them a care-killing competency. -

-
- - \ No newline at end of file diff --git a/tests/test_cli.sh b/tests/test_cli.sh deleted file mode 100755 index 03d6df8..0000000 --- a/tests/test_cli.sh +++ /dev/null @@ -1,175 +0,0 @@ -#!/bin/bash -# Comprehensive CLI Testing Script -# Tests all brightdata CLI commands to validate end-user experience - -set -e # Exit on error - -echo "================================================================================" -echo "COMPREHENSIVE CLI VALIDATION - Testing Real User Experience" -echo "================================================================================" -echo "Timestamp: $(date '+%Y%m%d_%H%M%S')" -echo "================================================================================" - -# Create probe directory structure for CLI tests -PROBE_DIR="probe/cli" -mkdir -p "$PROBE_DIR"/{scrape,search,help,errors} - -TIMESTAMP=$(date '+%Y%m%d_%H%M%S') -SUMMARY_FILE="$PROBE_DIR/cli_summary_$TIMESTAMP.txt" - -# Track results -TOTAL_TESTS=0 -PASSED_TESTS=0 -FAILED_TESTS=0 - -# Helper function to run CLI test -run_cli_test() { - local test_name=$1 - local command=$2 - local category=$3 - local output_file="$PROBE_DIR/$category/${test_name}_${TIMESTAMP}.txt" - - TOTAL_TESTS=$((TOTAL_TESTS + 1)) - - echo "" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "TEST: $test_name" - echo "COMMAND: $command" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - - # Run command and save output - if eval "$command" > "$output_file" 2>&1; then - echo " ✅ PASSED" - echo " 📁 Output: $output_file" - PASSED_TESTS=$((PASSED_TESTS + 1)) - return 0 - else - EXIT_CODE=$? 
- echo " ❌ FAILED (exit code: $EXIT_CODE)" - echo " 📁 Error output: $output_file" - FAILED_TESTS=$((FAILED_TESTS + 1)) - return 1 - fi -} - -# ============================================================================= -# STEP 1: HELP COMMANDS -# ============================================================================= - -echo "" -echo "📋 STEP 1: HELP & INFO COMMANDS" -echo "================================================================================" - -run_cli_test "help_main" "brightdata --help" "help" -run_cli_test "help_scrape" "brightdata scrape --help" "help" -run_cli_test "help_search" "brightdata search --help" "help" -run_cli_test "help_scrape_amazon" "brightdata scrape amazon --help" "help" -run_cli_test "help_search_amazon" "brightdata search amazon --help" "help" -run_cli_test "help_search_linkedin" "brightdata search linkedin --help" "help" - -# ============================================================================= -# STEP 2: SCRAPE COMMANDS (if we have test token - these will fail without real API) -# ============================================================================= - -echo "" -echo "📋 STEP 2: SCRAPE COMMANDS (syntax validation)" -echo "================================================================================" -echo "Note: These test CLI syntax, not actual API calls (would need valid token)" - -# Test CLI syntax validation (will fail on auth but validates parsing) -run_cli_test "scrape_amazon_products_help" \ - "brightdata scrape amazon products --help" \ - "scrape" || true - -run_cli_test "scrape_linkedin_profiles_help" \ - "brightdata scrape linkedin profiles --help" \ - "scrape" || true - -run_cli_test "scrape_facebook_posts_help" \ - "brightdata scrape facebook --help" \ - "scrape" || true - -run_cli_test "scrape_instagram_profiles_help" \ - "brightdata scrape instagram --help" \ - "scrape" || true - -# ============================================================================= -# STEP 3: SEARCH COMMANDS 
(syntax validation) -# ============================================================================= - -echo "" -echo "📋 STEP 3: SEARCH COMMANDS (syntax validation)" -echo "================================================================================" - -run_cli_test "search_google_help" \ - "brightdata search google --help" \ - "search" || true - -run_cli_test "search_linkedin_jobs_help" \ - "brightdata search linkedin jobs --help" \ - "search" || true - -# ============================================================================= -# STEP 4: FORMAT OPTIONS -# ============================================================================= - -echo "" -echo "📋 STEP 4: OUTPUT FORMAT OPTIONS" -echo "================================================================================" - -# Test that --output-format is recognized -run_cli_test "format_json_help" \ - "brightdata scrape --help | grep 'output-format'" \ - "help" || true - -run_cli_test "format_generic_help" \ - "brightdata scrape generic --help" \ - "help" || true - -# ============================================================================= -# FINAL SUMMARY -# ============================================================================= - -echo "" -echo "================================================================================" -echo "CLI VALIDATION SUMMARY" -echo "================================================================================" - -{ - echo "Timestamp: $(date)" - echo "" - echo "TEST RESULTS:" - echo " Total: $TOTAL_TESTS" - echo " Passed: $PASSED_TESTS" - echo " Failed: $FAILED_TESTS" - echo "" - - if [ $FAILED_TESTS -eq 0 ]; then - echo "✅ ALL CLI TESTS PASSED" - echo "" - echo "CLI is fully functional and ready for users!" 
- else - echo "⚠️ $FAILED_TESTS test(s) failed" - echo "" - echo "Check probe/cli/ directory for details" - fi - - echo "" - echo "📁 All outputs saved to: probe/cli/" - echo "" - echo "Directory structure:" - find "$PROBE_DIR" -type f | sort - -} | tee "$SUMMARY_FILE" - -echo "" -echo "================================================================================" -if [ $FAILED_TESTS -eq 0 ]; then - echo "🎉 CLI VALIDATION COMPLETE - ALL SYSTEMS GO" - exit 0 -else: - echo "⚠️ SOME CLI TESTS FAILED - CHECK OUTPUTS" - exit 1 -fi -echo "================================================================================" - diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py index e0310a0..e69de29 100644 --- a/tests/unit/__init__.py +++ b/tests/unit/__init__.py @@ -1 +0,0 @@ -"""Unit tests.""" diff --git a/tests/unit/test_amazon.py b/tests/unit/test_amazon.py deleted file mode 100644 index 9a313d0..0000000 --- a/tests/unit/test_amazon.py +++ /dev/null @@ -1,314 +0,0 @@ -"""Unit tests for Amazon scraper.""" - -from brightdata import BrightDataClient -from brightdata.scrapers.amazon import AmazonScraper - - -class TestAmazonScraperURLBased: - """Test Amazon scraper (URL-based extraction).""" - - def test_amazon_scraper_has_products_method(self): - """Test Amazon scraper has products method (async-first API).""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "products") - assert callable(scraper.products) - - def test_amazon_scraper_has_reviews_method(self): - """Test Amazon scraper has reviews method (async-first API).""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "reviews") - assert callable(scraper.reviews) - - def test_amazon_scraper_has_sellers_method(self): - """Test Amazon scraper has sellers method (async-first API).""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "sellers") - assert callable(scraper.sellers) - - def 
test_products_method_signature(self): - """Test products method has correct signature.""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.products) - - # Required: url parameter - assert "url" in sig.parameters - - # Optional: sync and timeout - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - - # Defaults - assert sig.parameters["timeout"].default == 240 - - def test_reviews_method_signature(self): - """Test reviews method has correct signature.""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.reviews) - - # Required: url - assert "url" in sig.parameters - - # Optional filters - assert "pastDays" in sig.parameters - assert "keyWord" in sig.parameters - assert "numOfReviews" in sig.parameters - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - - # Defaults - assert sig.parameters["timeout"].default == 240 - - def test_sellers_method_signature(self): - """Test sellers method has correct signature.""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.sellers) - - assert "url" in sig.parameters - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 240 - - -class TestAmazonDatasetIDs: - """Test Amazon has correct dataset IDs.""" - - def test_scraper_has_all_dataset_ids(self): - """Test scraper has dataset IDs for all types.""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - assert scraper.DATASET_ID # Products - assert scraper.DATASET_ID_REVIEWS - assert scraper.DATASET_ID_SELLERS - - # All should start with gd_ - assert scraper.DATASET_ID.startswith("gd_") - assert scraper.DATASET_ID_REVIEWS.startswith("gd_") - assert scraper.DATASET_ID_SELLERS.startswith("gd_") - - def test_dataset_ids_are_correct(self): - """Test dataset IDs match 
Bright Data identifiers.""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - # Verify known IDs - assert scraper.DATASET_ID == "gd_l7q7dkf244hwjntr0" # Products - assert scraper.DATASET_ID_REVIEWS == "gd_le8e811kzy4ggddlq" # Reviews - assert scraper.DATASET_ID_SELLERS == "gd_lhotzucw1etoe5iw1k" # Sellers - - -class TestAmazonSyncVsAsyncMode: - """Test sync vs async mode handling.""" - - def test_default_timeout_is_correct(self): - """Test default timeout is 240s for async workflow.""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.products) - - assert sig.parameters["timeout"].default == 240 - - def test_all_methods_dont_have_sync_parameter(self): - """Test all scrape methods don't have sync parameter (standard async pattern).""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - - for method_name in ["products", "reviews", "sellers"]: - sig = inspect.signature(getattr(scraper, method_name)) - assert "sync" not in sig.parameters - - -class TestAmazonAPISpecCompliance: - """Test compliance with exact API specifications.""" - - def test_products_api_spec(self): - """Test products() matches CP API spec.""" - client = BrightDataClient(token="test_token_123456789") - - # API Spec: client.scrape.amazon.products(url, timeout=240) - import inspect - - sig = inspect.signature(client.scrape.amazon.products) - - assert "url" in sig.parameters - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 240 - - def test_reviews_api_spec(self): - """Test reviews() matches CP API spec.""" - client = BrightDataClient(token="test_token_123456789") - - # API Spec: reviews(url, pastDays, keyWord, numOfReviews, sync, timeout) - import inspect - - sig = inspect.signature(client.scrape.amazon.reviews) - - params = sig.parameters - assert "url" in params - assert "pastDays" in params - assert "keyWord" in 
params - assert "numOfReviews" in params - assert "sync" not in params - assert "timeout" in params - - def test_sellers_api_spec(self): - """Test sellers() matches CP API spec.""" - client = BrightDataClient(token="test_token_123456789") - - # API Spec: sellers(url, timeout=240) - import inspect - - sig = inspect.signature(client.scrape.amazon.sellers) - - assert "url" in sig.parameters - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - - -class TestAmazonParameterArraySupport: - """Test array parameter support (str | array).""" - - def test_url_accepts_string(self): - """Test url parameter accepts single string.""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.products) - - # Type annotation should allow str | List[str] - url_annotation = str(sig.parameters["url"].annotation) - assert "Union" in url_annotation or "|" in url_annotation - assert "str" in url_annotation - - def test_url_accepts_list(self): - """Test url parameter accepts list.""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.products) - - url_annotation = str(sig.parameters["url"].annotation) - assert "List" in url_annotation or "list" in url_annotation - - -class TestAmazonAsyncFirstAPI: - """Test all methods follow async-first pattern.""" - - def test_all_methods_exist(self): - """Test all methods exist (async-first API, no _async suffix).""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - methods = ["products", "reviews", "sellers"] - - for method in methods: - assert hasattr(scraper, method) - assert callable(getattr(scraper, method)) - - -class TestAmazonClientIntegration: - """Test Amazon integrates properly with client.""" - - def test_amazon_accessible_via_client(self): - """Test Amazon scraper accessible via client.scrape.amazon.""" - client = BrightDataClient(token="test_token_123456789") - - amazon = 
client.scrape.amazon - assert amazon is not None - assert isinstance(amazon, AmazonScraper) - - def test_client_passes_token_to_scraper(self): - """Test client passes token to Amazon scraper.""" - token = "test_token_123456789" - client = BrightDataClient(token=token) - - amazon = client.scrape.amazon - assert amazon.bearer_token == token - - def test_all_amazon_methods_accessible_through_client(self): - """Test all Amazon methods accessible through client.""" - client = BrightDataClient(token="test_token_123456789") - - amazon = client.scrape.amazon - - assert callable(amazon.products) - assert callable(amazon.reviews) - assert callable(amazon.sellers) - - -class TestAmazonReviewsFilters: - """Test Amazon reviews method filters.""" - - def test_reviews_accepts_pastDays_filter(self): - """Test reviews method accepts pastDays parameter.""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.reviews) - - assert "pastDays" in sig.parameters - assert sig.parameters["pastDays"].default is None # Optional - - def test_reviews_accepts_keyWord_filter(self): - """Test reviews method accepts keyWord parameter.""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.reviews) - - assert "keyWord" in sig.parameters - assert sig.parameters["keyWord"].default is None - - def test_reviews_accepts_numOfReviews_filter(self): - """Test reviews method accepts numOfReviews parameter.""" - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.reviews) - - assert "numOfReviews" in sig.parameters - assert sig.parameters["numOfReviews"].default is None - - -class TestAmazonPhilosophicalPrinciples: - """Test Amazon scraper follows philosophical principles.""" - - def test_consistent_timeout_defaults(self): - """Test consistent timeout defaults across methods.""" - scraper = 
AmazonScraper(bearer_token="test_token_123456789") - - import inspect - - # All methods should default to 240s - for method_name in ["products", "reviews", "sellers"]: - sig = inspect.signature(getattr(scraper, method_name)) - assert sig.parameters["timeout"].default == 240 - - def test_uses_standard_async_workflow(self): - """Test methods use standard async workflow (no sync parameter).""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - import inspect - - for method_name in ["products", "reviews", "sellers"]: - sig = inspect.signature(getattr(scraper, method_name)) - - # Should not have sync parameter - assert "sync" not in sig.parameters - - def test_amazon_is_platform_expert(self): - """Test Amazon scraper knows its platform.""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - assert scraper.PLATFORM_NAME == "amazon" - assert scraper.DATASET_ID # Has dataset knowledge - assert scraper.MIN_POLL_TIMEOUT == 240 # Knows Amazon takes longer diff --git a/tests/unit/test_async_unblocker.py b/tests/unit/test_async_unblocker.py index 6595397..bc75ed1 100644 --- a/tests/unit/test_async_unblocker.py +++ b/tests/unit/test_async_unblocker.py @@ -1,190 +1,174 @@ -"""Unit tests for AsyncUnblockerClient.""" +"""Tests for web_unlocker/async_client.py — Trigger, status, and fetch operations.""" import pytest from unittest.mock import AsyncMock, MagicMock -from brightdata.api.async_unblocker import AsyncUnblockerClient + +from brightdata.web_unlocker.async_client import AsyncUnblockerClient from brightdata.exceptions import APIError +from tests.conftest import MockContextManager + -class MockAsyncContextManager: - """Helper to mock async context managers.""" +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- - def __init__(self, return_value): - self.return_value = return_value - async def __aenter__(self): - return 
self.return_value +@pytest.fixture +def engine(): + eng = MagicMock() + eng.BASE_URL = "https://api.brightdata.com" + return eng - async def __aexit__(self, exc_type, exc_val, exc_tb): - pass +@pytest.fixture +def client(engine): + return AsyncUnblockerClient(engine) -class TestAsyncUnblockerClient: - """Test AsyncUnblockerClient functionality.""" - def setup_method(self): - """Set up test fixtures.""" - self.engine = MagicMock() - self.engine.BASE_URL = "https://api.brightdata.com" - self.client = AsyncUnblockerClient(self.engine) +# --------------------------------------------------------------------------- +# Trigger +# --------------------------------------------------------------------------- + +class TestTrigger: @pytest.mark.asyncio - async def test_trigger_success(self): - """Test successful trigger returns response_id from header.""" - # Mock response with x-response-id header + async def test_returns_response_id(self, client, engine): response = MagicMock() response.headers.get.return_value = "test_response_id_123" + engine.post_to_url = MagicMock(return_value=MockContextManager(response)) - # Mock post_to_url to return async context manager - self.engine.post_to_url = MagicMock(return_value=MockAsyncContextManager(response)) - - # Trigger request - response_id = await self.client.trigger(zone="test_zone", url="https://example.com") + response_id = await client.trigger(zone="test_zone", url="https://example.com") - # Verify response_id returned assert response_id == "test_response_id_123" - - # Verify correct endpoint called - self.engine.post_to_url.assert_called_once() - call_args = self.engine.post_to_url.call_args + engine.post_to_url.assert_called_once() + call_args = engine.post_to_url.call_args assert call_args[0][0] == "https://api.brightdata.com/unblocker/req" assert call_args[1]["params"] == {"zone": "test_zone"} assert call_args[1]["json_data"]["url"] == "https://example.com" @pytest.mark.asyncio - async def 
test_trigger_with_additional_params(self): - """Test trigger passes additional parameters correctly.""" + async def test_passes_additional_params(self, client, engine): response = MagicMock() response.headers.get.return_value = "response_id_456" + engine.post_to_url = MagicMock(return_value=MockContextManager(response)) - self.engine.post_to_url = MagicMock(return_value=MockAsyncContextManager(response)) - - # Trigger with additional params - response_id = await self.client.trigger( + response_id = await client.trigger( zone="my_zone", url="https://google.com/search?q=test", format="raw", country="US" ) assert response_id == "response_id_456" - - # Verify params merged into payload - call_args = self.engine.post_to_url.call_args - payload = call_args[1]["json_data"] + payload = engine.post_to_url.call_args[1]["json_data"] assert payload["url"] == "https://google.com/search?q=test" assert payload["format"] == "raw" assert payload["country"] == "US" @pytest.mark.asyncio - async def test_trigger_no_response_id(self): - """Test trigger returns None when no response_id header.""" + async def test_returns_none_when_no_response_id(self, client, engine): response = MagicMock() - response.headers.get.return_value = None # No x-response-id + response.headers.get.return_value = None + engine.post_to_url = MagicMock(return_value=MockContextManager(response)) - self.engine.post_to_url = MagicMock(return_value=MockAsyncContextManager(response)) + response_id = await client.trigger(zone="test_zone", url="https://example.com") + assert response_id is None - response_id = await self.client.trigger(zone="test_zone", url="https://example.com") - assert response_id is None +# --------------------------------------------------------------------------- +# Get Status +# --------------------------------------------------------------------------- + +class TestGetStatus: @pytest.mark.asyncio - async def test_get_status_ready(self): - """Test get_status returns 'ready' for HTTP 200.""" + 
async def test_200_returns_ready(self, client, engine): response = MagicMock() response.status = 200 + engine.get_from_url = MagicMock(return_value=MockContextManager(response)) - self.engine.get_from_url = MagicMock(return_value=MockAsyncContextManager(response)) - - status = await self.client.get_status(zone="test_zone", response_id="abc123") + status = await client.get_status(zone="test_zone", response_id="abc123") assert status == "ready" - - # Verify correct endpoint and params - call_args = self.engine.get_from_url.call_args + call_args = engine.get_from_url.call_args assert call_args[0][0] == "https://api.brightdata.com/unblocker/get_result" assert call_args[1]["params"]["zone"] == "test_zone" assert call_args[1]["params"]["response_id"] == "abc123" @pytest.mark.asyncio - async def test_get_status_pending(self): - """Test get_status returns 'pending' for HTTP 202.""" + async def test_202_returns_pending(self, client, engine): response = MagicMock() response.status = 202 + engine.get_from_url = MagicMock(return_value=MockContextManager(response)) - self.engine.get_from_url = MagicMock(return_value=MockAsyncContextManager(response)) - - status = await self.client.get_status(zone="test_zone", response_id="xyz789") - + status = await client.get_status(zone="test_zone", response_id="xyz789") assert status == "pending" @pytest.mark.asyncio - async def test_get_status_error(self): - """Test get_status returns 'error' for non-200/202 status.""" - # Test various error codes + async def test_error_codes_return_error(self, client, engine): for error_code in [400, 404, 500, 503]: response = MagicMock() response.status = error_code + engine.get_from_url = MagicMock(return_value=MockContextManager(response)) - self.engine.get_from_url = MagicMock(return_value=MockAsyncContextManager(response)) + status = await client.get_status(zone="test_zone", response_id="err123") + assert status == "error", f"Expected 'error' for HTTP {error_code}" - status = await 
self.client.get_status(zone="test_zone", response_id="err123") - assert status == "error", f"Expected 'error' for HTTP {error_code}" +# --------------------------------------------------------------------------- +# Fetch Result +# --------------------------------------------------------------------------- + +class TestFetchResult: @pytest.mark.asyncio - async def test_fetch_result_success(self): - """Test fetch_result returns parsed JSON for HTTP 200.""" + async def test_200_returns_json(self, client, engine): expected_data = {"general": {"search_engine": "google"}, "organic": [{"title": "Result 1"}]} - response = MagicMock() response.status = 200 response.json = AsyncMock(return_value=expected_data) + engine.get_from_url = MagicMock(return_value=MockContextManager(response)) - self.engine.get_from_url = MagicMock(return_value=MockAsyncContextManager(response)) - - data = await self.client.fetch_result(zone="test_zone", response_id="fetch123") + data = await client.fetch_result(zone="test_zone", response_id="fetch123") assert data == expected_data response.json.assert_called_once() @pytest.mark.asyncio - async def test_fetch_result_not_ready(self): - """Test fetch_result raises APIError for HTTP 202 (pending).""" + async def test_202_raises_api_error(self, client, engine): response = MagicMock() response.status = 202 - - self.engine.get_from_url = MagicMock(return_value=MockAsyncContextManager(response)) + engine.get_from_url = MagicMock(return_value=MockContextManager(response)) with pytest.raises(APIError) as exc_info: - await self.client.fetch_result(zone="test_zone", response_id="pending123") + await client.fetch_result(zone="test_zone", response_id="pending123") assert "not ready yet" in str(exc_info.value).lower() assert "202" in str(exc_info.value) @pytest.mark.asyncio - async def test_fetch_result_error(self): - """Test fetch_result raises APIError for error status codes.""" + async def test_500_raises_api_error(self, client, engine): response = MagicMock() 
response.status = 500 response.text = AsyncMock(return_value="Internal Server Error") - - self.engine.get_from_url = MagicMock(return_value=MockAsyncContextManager(response)) + engine.get_from_url = MagicMock(return_value=MockContextManager(response)) with pytest.raises(APIError) as exc_info: - await self.client.fetch_result(zone="test_zone", response_id="error123") + await client.fetch_result(zone="test_zone", response_id="error123") - error_msg = str(exc_info.value) - assert "500" in error_msg - assert "Internal Server Error" in error_msg + assert "500" in str(exc_info.value) + assert "Internal Server Error" in str(exc_info.value) - @pytest.mark.asyncio - async def test_endpoint_constants(self): - """Test that endpoint constants are correct.""" - assert self.client.TRIGGER_ENDPOINT == "/unblocker/req" - assert self.client.FETCH_ENDPOINT == "/unblocker/get_result" - @pytest.mark.asyncio - async def test_client_initialization(self): - """Test client initializes with AsyncEngine.""" - engine = MagicMock() - client = AsyncUnblockerClient(engine) +# --------------------------------------------------------------------------- +# Constants and init +# --------------------------------------------------------------------------- - assert client.engine is engine + +class TestClientSetup: + def test_endpoint_constants(self, client): + assert client.TRIGGER_ENDPOINT == "/unblocker/req" + assert client.FETCH_ENDPOINT == "/unblocker/get_result" + + def test_stores_engine_reference(self): + engine = MagicMock() + c = AsyncUnblockerClient(engine) + assert c.engine is engine diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py deleted file mode 100644 index eef2da9..0000000 --- a/tests/unit/test_batch.py +++ /dev/null @@ -1,172 +0,0 @@ -""" -Tests for batch scraping operations. - -Verifies that scraping multiple URLs returns List[ScrapeResult] correctly. 
-""" - -from brightdata import BrightDataClient - - -class TestBatchOperations: - """Test batch scraping returns correct types.""" - - def test_single_url_returns_single_result(self): - """Test that a single URL returns ScrapeResult (not list).""" - client = BrightDataClient(token="test_token_123456789") - - # Verify single URL behavior - scraper = client.scrape.amazon - - # Single URL should return ScrapeResult - import inspect - - sig = inspect.signature(scraper.products) - - # Should accept Union[str, List[str]] - params = sig.parameters - assert "url" in params - - def test_list_with_one_url_returns_single_result(self): - """Test that list with 1 URL returns unwrapped ScrapeResult.""" - # This is the expected behavior - list with 1 item gets unwrapped - # This test documents the API contract - pass - - def test_multiple_urls_should_return_list(self): - """Test that multiple URLs should return List[ScrapeResult].""" - # This documents that the API SHOULD return a list of results - # when given multiple URLs, not a single result with data as list - - # Expected behavior: - # Input: ["url1", "url2", "url3"] - # Output: [ScrapeResult, ScrapeResult, ScrapeResult] - # NOT: ScrapeResult with data=[item1, item2, item3] - pass - - def test_batch_result_type_annotations(self): - """Test that method signatures indicate Union[ScrapeResult, List[ScrapeResult]].""" - from brightdata.scrapers.amazon import AmazonScraper - - scraper = AmazonScraper(bearer_token="test_token_123456789") - - import inspect - - sig = inspect.signature(scraper.products) - - # Check return type annotation - return_type = sig.return_annotation - assert return_type != inspect.Signature.empty, "Should have return type annotation" - - # Should be Union[ScrapeResult, List[ScrapeResult]] - type_str = str(return_type) - assert "ScrapeResult" in type_str - assert "List" in type_str or "Union" in type_str - - -class TestBatchScrapingBehavior: - """Test actual batch scraping behavior.""" - - def 
test_batch_operations_contract(self): - """Document the batch operations API contract.""" - # API Contract: - # 1. Single URL string → ScrapeResult - # 2. List with 1 URL → ScrapeResult (unwrapped for convenience) - # 3. List with 2+ URLs → List[ScrapeResult] (one per URL) - - # This ensures each URL gets its own result object with: - # - Individual success/error status - # - Individual timing information - # - Individual cost tracking - # - Individual data payload - pass - - def test_batch_result_independence(self): - """Test that batch results are independent.""" - # Each result in a batch should be independent: - # - If URL 1 fails, URL 2 should still have its own result - # - Each result has its own cost calculation - # - Each result has its own timing data - # - Each result has its own url field set - pass - - -class TestBatchErrorHandling: - """Test batch operations error handling.""" - - def test_batch_with_mixed_success_failure(self): - """Test batch operations with some URLs succeeding and some failing.""" - # Expected: Each URL gets its own ScrapeResult - # Some have success=True, some have success=False - # All are in the returned list - pass - - def test_batch_cost_calculation(self): - """Test that costs are divided among batch results.""" - # If total cost is $0.003 for 3 URLs - # Each result should have cost=$0.001 - pass - - -class TestBatchImplementationAllPlatforms: - """Verify batch fix is implemented across ALL platforms.""" - - def test_amazon_has_batch_logic(self): - """Verify Amazon scraper has batch transformation logic.""" - import inspect - from brightdata.scrapers.amazon import AmazonScraper - - source = inspect.getsource(AmazonScraper) - - # Should have the batch transformation code - assert "elif not is_single and isinstance(result.data, list):" in source - assert "for url_item, data_item in zip" in source - assert "List[ScrapeResult]" in source or "results.append" in source - - def test_linkedin_has_batch_logic(self): - """Verify 
LinkedIn scraper has batch transformation logic.""" - import inspect - from brightdata.scrapers.linkedin import LinkedInScraper - - source = inspect.getsource(LinkedInScraper) - - assert "elif not is_single and isinstance(result.data, list):" in source - assert "for url_item, data_item in zip" in source - - def test_instagram_has_batch_logic(self): - """Verify Instagram scraper has batch transformation logic.""" - import inspect - from brightdata.scrapers.instagram import InstagramScraper - - source = inspect.getsource(InstagramScraper) - - assert "elif not is_single and isinstance(result.data, list):" in source - assert "for url_item, data_item in zip" in source - - def test_facebook_has_batch_logic(self): - """Verify Facebook scraper has batch transformation logic.""" - import inspect - from brightdata.scrapers.facebook import FacebookScraper - - source = inspect.getsource(FacebookScraper) - - assert "elif not is_single and isinstance(result.data, list):" in source - assert "for url_item, data_item in zip" in source - - -class TestBatchBugRegression: - """Ensure the batch bug doesn't regress.""" - - def test_batch_returns_list_not_single_result_with_list_data(self): - """THE KEY TEST: Batch operations must return List[ScrapeResult], not ScrapeResult with list data.""" - # This is the core issue from issues.md - # - # BEFORE (BUG): - # Input: ["url1", "url2"] - # Output: ScrapeResult(data=[item1, item2]) ❌ WRONG - # - # AFTER (FIXED): - # Input: ["url1", "url2"] - # Output: [ScrapeResult(data=item1), ScrapeResult(data=item2)] ✅ CORRECT - # - # The fix ensures each URL gets its own ScrapeResult object - assert True # Implementation verified by code inspection tests above diff --git a/tests/unit/test_chatgpt.py b/tests/unit/test_chatgpt.py deleted file mode 100644 index 9fd9e2a..0000000 --- a/tests/unit/test_chatgpt.py +++ /dev/null @@ -1,265 +0,0 @@ -"""Unit tests for ChatGPT search service.""" - -import inspect -from brightdata import BrightDataClient -from 
brightdata.scrapers.chatgpt import ChatGPTSearchService - - -class TestChatGPTSearchService: - """Test ChatGPT search service.""" - - def test_chatgpt_search_has_chatGPT_method(self): - """Test ChatGPT search has chatGPT method (async-first API).""" - search = ChatGPTSearchService(bearer_token="test_token_123456789") - - assert hasattr(search, "chatGPT") - assert callable(search.chatGPT) - - def test_chatGPT_method_signature(self): - """Test chatGPT method has correct signature.""" - import inspect - - search = ChatGPTSearchService(bearer_token="test_token_123456789") - sig = inspect.signature(search.chatGPT) - - # Required: prompt - assert "prompt" in sig.parameters - - # Optional parameters - assert "country" in sig.parameters - assert "secondaryPrompt" in sig.parameters - assert "webSearch" in sig.parameters - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - - # Defaults - assert sig.parameters["timeout"].default == 180 - - def test_chatGPT_validates_required_prompt(self): - """Test chatGPT raises error if prompt is missing.""" - search = ChatGPTSearchService(bearer_token="test_token_123456789") - - # This would fail at runtime, but we test the validation exists - # (Can't actually call without mocking the engine) - assert "prompt" in str(inspect.signature(search.chatGPT).parameters) - - -class TestChatGPTAPISpecCompliance: - """Test compliance with exact API specifications.""" - - def test_api_spec_matches_cp_link(self): - """Test method matches CP link specification.""" - client = BrightDataClient(token="test_token_123456789") - - # API Spec: client.search.chatGPT(prompt, country, secondaryPrompt, webSearch, timeout) - import inspect - - sig = inspect.signature(client.search.chatGPT.chatGPT) - - params = sig.parameters - - # All parameters from spec - assert "prompt" in params # str | array, required - assert "country" in params # str | array, 2-letter format - assert "secondaryPrompt" in params # str | array - assert "webSearch" in 
params # bool | array - assert "sync" not in params # Removed - uses standard async workflow - assert "timeout" in params # int, default: 180 - - def test_parameter_defaults_match_spec(self): - """Test parameter defaults match specification.""" - import inspect - - search = ChatGPTSearchService(bearer_token="test_token_123456789") - sig = inspect.signature(search.chatGPT) - - # Defaults per spec - assert sig.parameters["timeout"].default == 180 - - # Optional params should default to None - assert sig.parameters["country"].default is None - assert sig.parameters["secondaryPrompt"].default is None - assert sig.parameters["webSearch"].default is None - - -class TestChatGPTParameterArraySupport: - """Test array parameter support (str | array, bool | array).""" - - def test_prompt_accepts_string(self): - """Test prompt parameter accepts single string.""" - import inspect - - search = ChatGPTSearchService(bearer_token="test_token_123456789") - sig = inspect.signature(search.chatGPT) - - # Type annotation should allow str | List[str] - prompt_annotation = str(sig.parameters["prompt"].annotation) - assert "Union" in prompt_annotation or "str" in prompt_annotation - - def test_prompt_accepts_list(self): - """Test prompt parameter accepts list.""" - import inspect - - search = ChatGPTSearchService(bearer_token="test_token_123456789") - sig = inspect.signature(search.chatGPT) - - prompt_annotation = str(sig.parameters["prompt"].annotation) - assert "List" in prompt_annotation or "list" in prompt_annotation - - def test_country_accepts_string_or_list(self): - """Test country accepts str | list.""" - import inspect - - search = ChatGPTSearchService(bearer_token="test_token_123456789") - sig = inspect.signature(search.chatGPT) - - annotation = str(sig.parameters["country"].annotation) - # Should be Optional[Union[str, List[str]]] - assert "str" in annotation - - def test_webSearch_accepts_bool_or_list(self): - """Test webSearch accepts bool | list[bool].""" - import inspect - - 
search = ChatGPTSearchService(bearer_token="test_token_123456789") - sig = inspect.signature(search.chatGPT) - - annotation = str(sig.parameters["webSearch"].annotation) - # Should accept bool | List[bool] - assert "bool" in annotation - - -class TestChatGPTSyncAsyncMode: - """Test standard async workflow (no sync parameter).""" - - def test_no_sync_parameter(self): - """Test methods don't have sync parameter (standard async pattern).""" - import inspect - - search = ChatGPTSearchService(bearer_token="test_token_123456789") - sig = inspect.signature(search.chatGPT) - - assert "sync" not in sig.parameters - - def test_timeout_defaults_to_180(self): - """Test timeout defaults to 180.""" - import inspect - - search = ChatGPTSearchService(bearer_token="test_token_123456789") - sig = inspect.signature(search.chatGPT) - - assert sig.parameters["timeout"].default == 180 - - def test_has_chatGPT_method(self): - """Test has chatGPT method (async-first API).""" - search = ChatGPTSearchService(bearer_token="test_token_123456789") - - assert hasattr(search, "chatGPT") - assert callable(search.chatGPT) - - -class TestChatGPTClientIntegration: - """Test ChatGPT search integrates with client.""" - - def test_chatgpt_accessible_via_client_search(self): - """Test ChatGPT search accessible via client.search.chatGPT.""" - client = BrightDataClient(token="test_token_123456789") - - chatgpt = client.search.chatGPT - assert chatgpt is not None - assert isinstance(chatgpt, ChatGPTSearchService) - - def test_client_passes_token_to_chatgpt_search(self): - """Test client passes token to ChatGPT search.""" - token = "test_token_123456789" - client = BrightDataClient(token=token) - - chatgpt = client.search.chatGPT - assert chatgpt.bearer_token == token - - def test_chatGPT_method_callable_through_client(self): - """Test chatGPT method callable through client (async-first API).""" - client = BrightDataClient(token="test_token_123456789") - - # Should be able to access the method - assert 
callable(client.search.chatGPT.chatGPT) - - -class TestChatGPTInterfaceExamples: - """Test interface examples from specification.""" - - def test_single_prompt_interface(self): - """Test single prompt interface.""" - client = BrightDataClient(token="test_token_123456789") - - # Interface should accept single prompt - import inspect - - sig = inspect.signature(client.search.chatGPT.chatGPT) - - # Can call with just prompt - assert "prompt" in sig.parameters - - # Other params are optional - assert sig.parameters["country"].default is None - assert sig.parameters["secondaryPrompt"].default is None - assert sig.parameters["webSearch"].default is None - - def test_batch_prompts_interface(self): - """Test batch prompts interface.""" - client = BrightDataClient(token="test_token_123456789") - - # Should accept lists for all parameters - import inspect - - sig = inspect.signature(client.search.chatGPT.chatGPT) - - # All array parameters should be in Union with List - prompt_annotation = str(sig.parameters["prompt"].annotation) - assert "List" in prompt_annotation - - -class TestChatGPTCountryValidation: - """Test country code validation.""" - - def test_country_should_be_2_letter_format(self): - """Test country parameter expects 2-letter format.""" - # This is validated in the implementation - # We verify the docstring mentions it - search = ChatGPTSearchService(bearer_token="test_token_123456789") - - # Check docstring mentions 2-letter format (async-first API) - doc = search.chatGPT.__doc__ - assert doc is not None and ( - "2-letter" in doc or "2 letter" in doc.replace("-", " ") or "country" in doc.lower() - ) - - -class TestChatGPTPhilosophicalPrinciples: - """Test ChatGPT search follows philosophical principles.""" - - def test_fixed_url_per_spec(self): - """Test URL is fixed to chatgpt.com per spec.""" - # Per spec comment: "the param URL will be fixed to https://chatgpt.com" - # This is handled in the implementation - search = 
ChatGPTSearchService(bearer_token="test_token_123456789") - - # Verify implementation exists (can't test without API call) - assert search.DATASET_ID == "gd_m7aof0k82r803d5bjm" - - def test_consistent_with_other_search_services(self): - """Test ChatGPT search follows same patterns as other search services.""" - import inspect - - search = ChatGPTSearchService(bearer_token="test_token_123456789") - - # Should have chatGPT method (async-first API) - assert hasattr(search, "chatGPT") - assert callable(search.chatGPT) - - # Should have timeout parameter - sig = inspect.signature(search.chatGPT) - assert "timeout" in sig.parameters - - # Should not have sync parameter (standard async pattern) - assert "sync" not in sig.parameters diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 771f834..0d72ea9 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -1,230 +1,423 @@ -"""Unit tests for BrightDataClient.""" +"""Tests for client.py — BrightDataClient init, services, context manager, API methods.""" -import os import pytest -from unittest.mock import patch -from brightdata import BrightDataClient -from brightdata.exceptions import ValidationError +from unittest.mock import AsyncMock, MagicMock, patch +from brightdata.client import BrightDataClient +from brightdata.exceptions import ValidationError, AuthenticationError, APIError +from brightdata.scrapers.service import ScrapeService +from brightdata.serp.service import SearchService +from brightdata.crawler.service import CrawlerService +from brightdata.datasets import DatasetsClient -class TestClientInitialization: - """Test client initialization and configuration.""" +from tests.conftest import MockResponse, MockContextManager - def test_client_with_explicit_token(self): - """Test client initialization with explicit token.""" - client = BrightDataClient(token="test_token_123456789") - assert client.token == "test_token_123456789" - assert client.timeout == 30 # Default timeout - 
assert client.web_unlocker_zone == "sdk_unlocker" - assert client.serp_zone == "sdk_serp" +# --------------------------------------------------------------------------- +# Token loading +# --------------------------------------------------------------------------- - def test_client_with_custom_config(self): - """Test client with custom configuration.""" - client = BrightDataClient( - token="custom_token_123456789", - timeout=60, - web_unlocker_zone="my_unlocker", - serp_zone="my_serp", - ) - assert client.timeout == 60 - assert client.web_unlocker_zone == "my_unlocker" - assert client.serp_zone == "my_serp" - - def test_client_loads_from_brightdata_api_token(self): - """Test client loads token from BRIGHTDATA_API_TOKEN.""" - with patch.dict(os.environ, {"BRIGHTDATA_API_TOKEN": "env_token_123456789"}): - client = BrightDataClient() - assert client.token == "env_token_123456789" - - def test_client_prioritizes_explicit_token_over_env(self): - """Test explicit token takes precedence over environment.""" - with patch.dict(os.environ, {"BRIGHTDATA_API_TOKEN": "env_token_123456789"}): - client = BrightDataClient(token="explicit_token_123456789") - assert client.token == "explicit_token_123456789" - - def test_client_raises_error_without_token(self): - """Test client raises ValidationError when no token provided.""" - with patch.dict(os.environ, {}, clear=True): - with pytest.raises(ValidationError) as exc_info: - BrightDataClient() +class TestTokenLoading: + def test_accepts_explicit_token(self): + c = BrightDataClient(token="tok_1234567890") + assert c.token == "tok_1234567890" + + def test_strips_whitespace(self): + c = BrightDataClient(token=" tok_1234567890 ") + assert c.token == "tok_1234567890" - assert "API token required" in str(exc_info.value) - assert "BRIGHTDATA_API_TOKEN" in str(exc_info.value) + @patch.dict("os.environ", {"BRIGHTDATA_API_TOKEN": "env_token_12345"}) + def test_reads_from_env(self): + c = BrightDataClient() + assert c.token == 
"env_token_12345" - def test_client_raises_error_for_invalid_token_format(self): - """Test client raises ValidationError for invalid token format.""" - with pytest.raises(ValidationError) as exc_info: + def test_raises_without_token(self): + with patch.dict("os.environ", {}, clear=True): + with pytest.raises(ValidationError, match="API token required"): + BrightDataClient() + + def test_rejects_short_token(self): + with pytest.raises(ValidationError, match="at least 10 characters"): BrightDataClient(token="short") - assert "Invalid token format" in str(exc_info.value) + def test_rejects_non_string_token(self): + with pytest.raises(ValidationError): + BrightDataClient(token=12345678901) # type: ignore - def test_client_raises_error_for_non_string_token(self): - """Test client raises ValidationError for non-string token.""" - with pytest.raises(ValidationError) as exc_info: - BrightDataClient(token=12345) + def test_explicit_token_takes_precedence(self): + with patch.dict("os.environ", {"BRIGHTDATA_API_TOKEN": "env_token_12345"}): + c = BrightDataClient(token="explicit_token_12345") + assert c.token == "explicit_token_12345" - assert "Invalid token format" in str(exc_info.value) +# --------------------------------------------------------------------------- +# Init configuration +# --------------------------------------------------------------------------- -class TestClientTokenManagement: - """Test token management and validation.""" - def test_token_is_stripped(self): - """Test token whitespace is stripped.""" - client = BrightDataClient(token=" token_with_spaces_123 ") - assert client.token == "token_with_spaces_123" +class TestInitConfig: + def test_default_timeout(self): + c = BrightDataClient(token="tok_1234567890") + assert c.timeout == 30 - def test_env_token_is_stripped(self): - """Test environment token whitespace is stripped.""" - with patch.dict(os.environ, {"BRIGHTDATA_API_TOKEN": " env_token_123456789 "}): - client = BrightDataClient() - assert 
client.token == "env_token_123456789" + def test_custom_timeout(self): + c = BrightDataClient(token="tok_1234567890", timeout=120) + assert c.timeout == 120 + def test_default_zone_names(self): + c = BrightDataClient(token="tok_1234567890") + assert c.web_unlocker_zone == "sdk_unlocker" + assert c.serp_zone == "sdk_serp" -class TestClientServiceProperties: - """Test hierarchical service access properties.""" + def test_custom_zone_names(self): + c = BrightDataClient( + token="tok_1234567890", + web_unlocker_zone="my_unlocker", + serp_zone="my_serp", + ) + assert c.web_unlocker_zone == "my_unlocker" + assert c.serp_zone == "my_serp" - def test_scrape_service_property(self): - """Test scrape service property returns ScrapeService.""" - client = BrightDataClient(token="test_token_123456789") + def test_creates_engine(self): + c = BrightDataClient(token="tok_1234567890") + assert c.engine is not None + assert c.engine.bearer_token == "tok_1234567890" - scrape_service = client.scrape - assert scrape_service is not None + def test_services_none_before_access(self): + c = BrightDataClient(token="tok_1234567890") + assert c._scrape_service is None + assert c._search_service is None + assert c._crawler_service is None + assert c._datasets_client is None - # All scrapers should now work - assert scrape_service.amazon is not None - assert scrape_service.linkedin is not None - assert scrape_service.chatgpt is not None + def test_auto_create_zones_default_true(self): + c = BrightDataClient(token="tok_1234567890") + assert c.auto_create_zones is True - def test_scrape_service_is_cached(self): - """Test scrape service is cached (returns same instance).""" - client = BrightDataClient(token="test_token_123456789") + def test_auto_create_zones_can_disable(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + assert c.auto_create_zones is False - service1 = client.scrape - service2 = client.scrape - assert service1 is service2 - def 
test_search_service_property(self): - """Test search service property returns SearchService.""" - client = BrightDataClient(token="test_token_123456789") +# --------------------------------------------------------------------------- +# Service properties (lazy init) +# --------------------------------------------------------------------------- - search_service = client.search - assert search_service is not None - # All search methods should exist and be callable (async-first API) - assert callable(search_service.google) - assert callable(search_service.bing) - assert callable(search_service.yandex) +class TestServiceProperties: + def test_scrape_returns_scrape_service(self): + c = BrightDataClient(token="tok_1234567890") + s = c.scrape + assert isinstance(s, ScrapeService) - def test_crawler_service_property(self): - """Test crawler service property returns CrawlerService.""" - client = BrightDataClient(token="test_token_123456789") + def test_scrape_returns_same_instance(self): + c = BrightDataClient(token="tok_1234567890") + assert c.scrape is c.scrape - crawler_service = client.crawler - assert crawler_service is not None - assert hasattr(crawler_service, "discover") - assert hasattr(crawler_service, "sitemap") + def test_search_returns_search_service(self): + c = BrightDataClient(token="tok_1234567890") + s = c.search + assert isinstance(s, SearchService) + def test_search_returns_same_instance(self): + c = BrightDataClient(token="tok_1234567890") + assert c.search is c.search -class TestClientBackwardCompatibility: - """Test backward compatibility with old API.""" + def test_crawler_returns_crawler_service(self): + c = BrightDataClient(token="tok_1234567890") + s = c.crawler + assert isinstance(s, CrawlerService) - def test_scrape_url_method_exists(self): - """Test scrape_url method exists for backward compatibility.""" - client = BrightDataClient(token="test_token_123456789") - assert hasattr(client, "scrape_url") + def 
test_crawler_returns_same_instance(self): + c = BrightDataClient(token="tok_1234567890") + assert c.crawler is c.crawler + def test_datasets_returns_datasets_client(self): + c = BrightDataClient(token="tok_1234567890") + d = c.datasets + assert isinstance(d, DatasetsClient) -class TestClientRepr: - """Test client string representation.""" + def test_datasets_returns_same_instance(self): + c = BrightDataClient(token="tok_1234567890") + assert c.datasets is c.datasets - def test_repr_shows_token_preview(self): - """Test __repr__ shows token preview.""" - client = BrightDataClient(token="1234567890abcdefghij") - repr_str = repr(client) - assert "BrightDataClient" in repr_str - assert "1234567890" in repr_str # First 10 chars - assert "fghij" in repr_str # Last 5 chars - assert "abcde" not in repr_str # Middle should not be shown +# --------------------------------------------------------------------------- +# Browser property +# --------------------------------------------------------------------------- - def test_repr_shows_status(self): - """Test __repr__ shows connection status.""" - client = BrightDataClient(token="test_token_123456789") - repr_str = repr(client) - assert "status" in repr_str.lower() +class TestBrowserProperty: + def test_raises_without_credentials(self): + with patch.dict("os.environ", {}, clear=False): + # Ensure env vars are not set + import os + os.environ.pop("BRIGHTDATA_BROWSERAPI_USERNAME", None) + os.environ.pop("BRIGHTDATA_BROWSERAPI_PASSWORD", None) -class TestClientConfiguration: - """Test client configuration options.""" + c = BrightDataClient(token="tok_1234567890") + with pytest.raises(ValidationError, match="Browser API credentials"): + _ = c.browser + + def test_accepts_explicit_credentials(self): + c = BrightDataClient( + token="tok_1234567890", + browser_username="brd-user", + browser_password="pass123", + ) + b = c.browser + assert b is not None + + @patch.dict( + "os.environ", + { + "BRIGHTDATA_BROWSERAPI_USERNAME": "env-user", 
+ "BRIGHTDATA_BROWSERAPI_PASSWORD": "env-pass", + }, + ) + def test_reads_credentials_from_env(self): + c = BrightDataClient(token="tok_1234567890") + b = c.browser + assert b is not None + + def test_returns_same_instance(self): + c = BrightDataClient( + token="tok_1234567890", + browser_username="brd-user", + browser_password="pass123", + ) + assert c.browser is c.browser - def test_auto_create_zones_default_true(self): - """Test auto_create_zones defaults to True.""" - client = BrightDataClient(token="test_token_123456789") - assert client.auto_create_zones is True - - def test_auto_create_zones_can_be_enabled(self): - """Test auto_create_zones can be enabled.""" - client = BrightDataClient(token="test_token_123456789", auto_create_zones=True) - assert client.auto_create_zones is True - - def test_zones_ensured_flag_starts_false(self): - """Test _zones_ensured flag starts as False.""" - client = BrightDataClient(token="test_token_123456789") - assert client._zones_ensured is False - - def test_zone_manager_starts_as_none(self): - """Test zone manager starts as None.""" - client = BrightDataClient(token="test_token_123456789") - assert client._zone_manager is None - - def test_default_timeout_is_30(self): - """Test default timeout is 30 seconds.""" - client = BrightDataClient(token="test_token_123456789") - assert client.timeout == 30 - - def test_custom_timeout_is_respected(self): - """Test custom timeout is respected.""" - client = BrightDataClient(token="test_token_123456789", timeout=120) - assert client.timeout == 120 - - -class TestClientErrorMessages: - """Test client error messages are clear and helpful.""" - - def test_missing_token_error_is_helpful(self): - """Test missing token error provides helpful guidance.""" - with patch.dict(os.environ, {}, clear=True): - with pytest.raises(ValidationError) as exc_info: - BrightDataClient() - error_msg = str(exc_info.value) - assert "API token required" in error_msg - assert "BrightDataClient(token=" in error_msg 
- assert "BRIGHTDATA_API_TOKEN" in error_msg - assert "https://brightdata.com" in error_msg +# --------------------------------------------------------------------------- +# _ensure_initialized +# --------------------------------------------------------------------------- - def test_invalid_token_format_error_is_clear(self): - """Test invalid token format error is clear.""" - with pytest.raises(ValidationError) as exc_info: - BrightDataClient(token="bad") - error_msg = str(exc_info.value) - assert "Invalid token format" in error_msg - assert "at least 10 characters" in error_msg +class TestEnsureInitialized: + def test_raises_if_no_session(self): + c = BrightDataClient(token="tok_1234567890") + with pytest.raises(RuntimeError, match="not initialized"): + c._ensure_initialized() -class TestClientContextManager: - """Test client context manager support.""" +# --------------------------------------------------------------------------- +# Context manager +# --------------------------------------------------------------------------- - def test_client_supports_async_context_manager(self): - """Test client supports async context manager protocol.""" - client = BrightDataClient(token="test_token_123456789") - assert hasattr(client, "__aenter__") - assert hasattr(client, "__aexit__") - assert callable(client.__aenter__) - assert callable(client.__aexit__) +class TestContextManager: + @pytest.mark.asyncio + async def test_aenter_creates_session(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + assert c.engine._session is not None + + @pytest.mark.asyncio + async def test_aexit_closes_session(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + session = c.engine._session + assert c.engine._session is None + assert session.closed + + @pytest.mark.asyncio + async def test_returns_self(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c as client: 
+ assert client is c + + @pytest.mark.asyncio + async def test_validate_token_on_enter_success(self): + c = BrightDataClient( + token="tok_1234567890", + validate_token=True, + auto_create_zones=False, + ) + with patch.object(c, "test_connection", new_callable=AsyncMock, return_value=True): + with patch.object(c, "_ensure_zones", new_callable=AsyncMock): + async with c: + pass # should not raise + + @pytest.mark.asyncio + async def test_validate_token_on_enter_failure(self): + c = BrightDataClient( + token="tok_1234567890", + validate_token=True, + auto_create_zones=False, + ) + with patch.object(c, "test_connection", new_callable=AsyncMock, return_value=False): + with pytest.raises(AuthenticationError, match="Token validation failed"): + async with c: + pass + + +# --------------------------------------------------------------------------- +# test_connection +# --------------------------------------------------------------------------- + + +class TestTestConnection: + @pytest.mark.asyncio + async def test_returns_true_on_200(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + c.engine.get_from_url = MagicMock( + return_value=MockContextManager(MockResponse(200, json_data=[])) + ) + result = await c.test_connection() + assert result is True + assert c._is_connected is True + + @pytest.mark.asyncio + async def test_returns_false_on_401(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + c.engine.get_from_url = MagicMock(return_value=MockContextManager(MockResponse(401))) + result = await c.test_connection() + assert result is False + assert c._is_connected is False + + @pytest.mark.asyncio + async def test_returns_false_on_exception(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + c.engine.get_from_url = MagicMock(side_effect=OSError("Network down")) + result = await c.test_connection() + assert result is False + + +# 
--------------------------------------------------------------------------- +# get_account_info +# --------------------------------------------------------------------------- + + +class TestGetAccountInfo: + @pytest.mark.asyncio + async def test_returns_account_info(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + zones = [{"name": "zone1"}, {"name": "zone2"}] + c.engine.get_from_url = MagicMock( + return_value=MockContextManager(MockResponse(200, json_data=zones)) + ) + info = await c.get_account_info() + + assert info["zone_count"] == 2 + assert info["token_valid"] is True + assert len(info["zones"]) == 2 + + @pytest.mark.asyncio + async def test_caches_result(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + c.engine.get_from_url = MagicMock( + return_value=MockContextManager(MockResponse(200, json_data=[{"name": "z"}])) + ) + info1 = await c.get_account_info() + info2 = await c.get_account_info() + + assert info1 is info2 + # Should only call API once + c.engine.get_from_url.assert_called_once() + + @pytest.mark.asyncio + async def test_refresh_bypasses_cache(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + c.engine.get_from_url = MagicMock( + return_value=MockContextManager(MockResponse(200, json_data=[])) + ) + await c.get_account_info() + await c.get_account_info(refresh=True) + + assert c.engine.get_from_url.call_count == 2 + + @pytest.mark.asyncio + async def test_401_raises_auth_error(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + c.engine.get_from_url = MagicMock( + return_value=MockContextManager(MockResponse(401, text_data="Unauthorized")) + ) + with pytest.raises(AuthenticationError, match="Invalid token"): + await c.get_account_info() + + @pytest.mark.asyncio + async def test_500_raises_api_error(self): + c = BrightDataClient(token="tok_1234567890", 
auto_create_zones=False) + async with c: + c.engine.get_from_url = MagicMock( + return_value=MockContextManager(MockResponse(500, text_data="Server Error")) + ) + with pytest.raises(APIError, match="Failed to get account info"): + await c.get_account_info() + + @pytest.mark.asyncio + async def test_empty_zones_warns(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + c.engine.get_from_url = MagicMock( + return_value=MockContextManager(MockResponse(200, json_data=[])) + ) + with pytest.warns(UserWarning, match="No active zones"): + await c.get_account_info() + + +# --------------------------------------------------------------------------- +# list_zones / delete_zone +# --------------------------------------------------------------------------- + + +class TestZoneOperations: + @pytest.mark.asyncio + async def test_list_zones_delegates_to_zone_manager(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + c._zone_manager = AsyncMock() + c._zone_manager.list_zones = AsyncMock(return_value=[{"name": "z1"}]) + + zones = await c.list_zones() + + assert zones == [{"name": "z1"}] + + @pytest.mark.asyncio + async def test_list_zones_creates_zone_manager(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + # Mock engine to avoid real HTTP + c.engine.get = MagicMock( + return_value=MockContextManager(MockResponse(200, json_data=[])) + ) + zones = await c.list_zones() + assert c._zone_manager is not None + + @pytest.mark.asyncio + async def test_delete_zone_delegates(self): + c = BrightDataClient(token="tok_1234567890", auto_create_zones=False) + async with c: + c._zone_manager = AsyncMock() + c._zone_manager.delete_zone = AsyncMock() + + await c.delete_zone("test_zone") + + c._zone_manager.delete_zone.assert_called_once_with("test_zone") + + +# --------------------------------------------------------------------------- +# __repr__ +# 
--------------------------------------------------------------------------- + + +class TestRepr: + def test_includes_token_preview(self): + c = BrightDataClient(token="tok_1234567890_abcde") + r = repr(c) + assert "tok_12345" in r + assert "abcde" in r + + def test_shows_not_tested_by_default(self): + c = BrightDataClient(token="tok_1234567890") + assert "Not tested" in repr(c) diff --git a/tests/unit/test_constants.py b/tests/unit/test_constants.py deleted file mode 100644 index 4882828..0000000 --- a/tests/unit/test_constants.py +++ /dev/null @@ -1,274 +0,0 @@ -"""Unit tests for constants module.""" - -from brightdata import constants - - -class TestPollingConstants: - """Test polling configuration constants.""" - - def test_default_poll_interval_exists(self): - """Test DEFAULT_POLL_INTERVAL constant exists.""" - assert hasattr(constants, "DEFAULT_POLL_INTERVAL") - - def test_default_poll_interval_is_integer(self): - """Test DEFAULT_POLL_INTERVAL is an integer.""" - assert isinstance(constants.DEFAULT_POLL_INTERVAL, int) - - def test_default_poll_interval_is_positive(self): - """Test DEFAULT_POLL_INTERVAL is positive.""" - assert constants.DEFAULT_POLL_INTERVAL > 0 - - def test_default_poll_interval_value(self): - """Test DEFAULT_POLL_INTERVAL has expected value.""" - assert constants.DEFAULT_POLL_INTERVAL == 10 - - def test_default_poll_timeout_exists(self): - """Test DEFAULT_POLL_TIMEOUT constant exists.""" - assert hasattr(constants, "DEFAULT_POLL_TIMEOUT") - - def test_default_poll_timeout_is_integer(self): - """Test DEFAULT_POLL_TIMEOUT is an integer.""" - assert isinstance(constants.DEFAULT_POLL_TIMEOUT, int) - - def test_default_poll_timeout_is_positive(self): - """Test DEFAULT_POLL_TIMEOUT is positive.""" - assert constants.DEFAULT_POLL_TIMEOUT > 0 - - def test_default_poll_timeout_value(self): - """Test DEFAULT_POLL_TIMEOUT has expected value.""" - assert constants.DEFAULT_POLL_TIMEOUT == 600 - - def test_poll_timeout_greater_than_interval(self): - 
"""Test DEFAULT_POLL_TIMEOUT is greater than DEFAULT_POLL_INTERVAL.""" - assert constants.DEFAULT_POLL_TIMEOUT > constants.DEFAULT_POLL_INTERVAL - - -class TestTimeoutConstants: - """Test timeout configuration constants.""" - - def test_default_timeout_short_exists(self): - """Test DEFAULT_TIMEOUT_SHORT constant exists.""" - assert hasattr(constants, "DEFAULT_TIMEOUT_SHORT") - - def test_default_timeout_short_is_integer(self): - """Test DEFAULT_TIMEOUT_SHORT is an integer.""" - assert isinstance(constants.DEFAULT_TIMEOUT_SHORT, int) - - def test_default_timeout_short_is_positive(self): - """Test DEFAULT_TIMEOUT_SHORT is positive.""" - assert constants.DEFAULT_TIMEOUT_SHORT > 0 - - def test_default_timeout_short_value(self): - """Test DEFAULT_TIMEOUT_SHORT has expected value.""" - assert constants.DEFAULT_TIMEOUT_SHORT == 180 - - def test_default_timeout_medium_exists(self): - """Test DEFAULT_TIMEOUT_MEDIUM constant exists.""" - assert hasattr(constants, "DEFAULT_TIMEOUT_MEDIUM") - - def test_default_timeout_medium_is_integer(self): - """Test DEFAULT_TIMEOUT_MEDIUM is an integer.""" - assert isinstance(constants.DEFAULT_TIMEOUT_MEDIUM, int) - - def test_default_timeout_medium_is_positive(self): - """Test DEFAULT_TIMEOUT_MEDIUM is positive.""" - assert constants.DEFAULT_TIMEOUT_MEDIUM > 0 - - def test_default_timeout_medium_value(self): - """Test DEFAULT_TIMEOUT_MEDIUM has expected value.""" - assert constants.DEFAULT_TIMEOUT_MEDIUM == 240 - - def test_default_timeout_long_exists(self): - """Test DEFAULT_TIMEOUT_LONG constant exists.""" - assert hasattr(constants, "DEFAULT_TIMEOUT_LONG") - - def test_default_timeout_long_is_integer(self): - """Test DEFAULT_TIMEOUT_LONG is an integer.""" - assert isinstance(constants.DEFAULT_TIMEOUT_LONG, int) - - def test_default_timeout_long_is_positive(self): - """Test DEFAULT_TIMEOUT_LONG is positive.""" - assert constants.DEFAULT_TIMEOUT_LONG > 0 - - def test_default_timeout_long_value(self): - """Test DEFAULT_TIMEOUT_LONG has 
expected value.""" - assert constants.DEFAULT_TIMEOUT_LONG == 120 - - def test_timeout_relationships(self): - """Test timeout constants have logical relationships.""" - # Medium should be greater than short - assert constants.DEFAULT_TIMEOUT_MEDIUM > constants.DEFAULT_TIMEOUT_SHORT - - -class TestScraperConstants: - """Test scraper configuration constants.""" - - def test_default_min_poll_timeout_exists(self): - """Test DEFAULT_MIN_POLL_TIMEOUT constant exists.""" - assert hasattr(constants, "DEFAULT_MIN_POLL_TIMEOUT") - - def test_default_min_poll_timeout_is_integer(self): - """Test DEFAULT_MIN_POLL_TIMEOUT is an integer.""" - assert isinstance(constants.DEFAULT_MIN_POLL_TIMEOUT, int) - - def test_default_min_poll_timeout_is_positive(self): - """Test DEFAULT_MIN_POLL_TIMEOUT is positive.""" - assert constants.DEFAULT_MIN_POLL_TIMEOUT > 0 - - def test_default_min_poll_timeout_value(self): - """Test DEFAULT_MIN_POLL_TIMEOUT has expected value.""" - assert constants.DEFAULT_MIN_POLL_TIMEOUT == 180 - - def test_default_cost_per_record_exists(self): - """Test DEFAULT_COST_PER_RECORD constant exists.""" - assert hasattr(constants, "DEFAULT_COST_PER_RECORD") - - def test_default_cost_per_record_is_float(self): - """Test DEFAULT_COST_PER_RECORD is a float.""" - assert isinstance(constants.DEFAULT_COST_PER_RECORD, float) - - def test_default_cost_per_record_is_positive(self): - """Test DEFAULT_COST_PER_RECORD is positive.""" - assert constants.DEFAULT_COST_PER_RECORD > 0 - - def test_default_cost_per_record_value(self): - """Test DEFAULT_COST_PER_RECORD has expected value.""" - assert constants.DEFAULT_COST_PER_RECORD == 0.001 - - -class TestConstantsDocumentation: - """Test constants have proper documentation.""" - - def test_default_poll_interval_has_docstring(self): - """Test DEFAULT_POLL_INTERVAL has documentation.""" - # Check module docstrings or comments exist - import inspect - - source = inspect.getsource(constants) - assert "DEFAULT_POLL_INTERVAL" in source - - 
def test_constants_module_has_docstring(self): - """Test constants module has docstring.""" - assert constants.__doc__ is not None - assert len(constants.__doc__) > 0 - - -class TestConstantsUsage: - """Test constants are used throughout the codebase.""" - - def test_constants_imported_in_base_scraper(self): - """Test constants are imported in base scraper.""" - from brightdata.scrapers import base - - # Should import from constants module - import inspect - - source = inspect.getsource(base) - assert "from ..constants import" in source or "constants" in source - - def test_constants_imported_in_polling(self): - """Test constants are imported in polling utilities.""" - from brightdata.utils import polling - - import inspect - - source = inspect.getsource(polling) - assert "from ..constants import" in source or "constants" in source - - def test_default_poll_interval_used_in_polling(self): - """Test DEFAULT_POLL_INTERVAL is used in polling module.""" - from brightdata.utils import polling - - import inspect - - source = inspect.getsource(polling) - assert "DEFAULT_POLL_INTERVAL" in source - - -class TestConstantsImmutability: - """Test constants maintain their values.""" - - def test_constants_are_not_none(self): - """Test all constants are not None.""" - assert constants.DEFAULT_POLL_INTERVAL is not None - assert constants.DEFAULT_POLL_TIMEOUT is not None - assert constants.DEFAULT_TIMEOUT_SHORT is not None - assert constants.DEFAULT_TIMEOUT_MEDIUM is not None - assert constants.DEFAULT_TIMEOUT_LONG is not None - assert constants.DEFAULT_MIN_POLL_TIMEOUT is not None - assert constants.DEFAULT_COST_PER_RECORD is not None - - def test_constants_have_expected_types(self): - """Test all constants have expected types.""" - # Integer constants - assert isinstance(constants.DEFAULT_POLL_INTERVAL, int) - assert isinstance(constants.DEFAULT_POLL_TIMEOUT, int) - assert isinstance(constants.DEFAULT_TIMEOUT_SHORT, int) - assert isinstance(constants.DEFAULT_TIMEOUT_MEDIUM, int) 
- assert isinstance(constants.DEFAULT_TIMEOUT_LONG, int) - assert isinstance(constants.DEFAULT_MIN_POLL_TIMEOUT, int) - - # Float constant - assert isinstance(constants.DEFAULT_COST_PER_RECORD, float) - - -class TestConstantsExports: - """Test constants module exports.""" - - def test_can_import_constants_from_brightdata(self): - """Test can import constants from brightdata package.""" - from brightdata import constants as const - - assert const is not None - assert hasattr(const, "DEFAULT_POLL_INTERVAL") - - def test_can_import_specific_constants(self): - """Test can import specific constants.""" - from brightdata.constants import ( - DEFAULT_POLL_INTERVAL, - DEFAULT_POLL_TIMEOUT, - DEFAULT_TIMEOUT_SHORT, - DEFAULT_TIMEOUT_MEDIUM, - DEFAULT_TIMEOUT_LONG, - DEFAULT_MIN_POLL_TIMEOUT, - DEFAULT_COST_PER_RECORD, - ) - - assert DEFAULT_POLL_INTERVAL is not None - assert DEFAULT_POLL_TIMEOUT is not None - assert DEFAULT_TIMEOUT_SHORT is not None - assert DEFAULT_TIMEOUT_MEDIUM is not None - assert DEFAULT_TIMEOUT_LONG is not None - assert DEFAULT_MIN_POLL_TIMEOUT is not None - assert DEFAULT_COST_PER_RECORD is not None - - -class TestConstantsReasonableValues: - """Test constants have reasonable values for production use.""" - - def test_poll_interval_is_reasonable(self): - """Test poll interval is reasonable (not too frequent, not too slow).""" - # Should be between 1 and 60 seconds - assert 1 <= constants.DEFAULT_POLL_INTERVAL <= 60 - - def test_poll_timeout_is_reasonable(self): - """Test poll timeout is reasonable.""" - # Should be at least 1 minute, but not more than 30 minutes - assert 60 <= constants.DEFAULT_POLL_TIMEOUT <= 1800 - - def test_timeouts_are_reasonable(self): - """Test all timeout values are reasonable for API operations.""" - # All timeouts should be between 30 seconds and 10 minutes - assert 30 <= constants.DEFAULT_TIMEOUT_SHORT <= 600 - assert 30 <= constants.DEFAULT_TIMEOUT_MEDIUM <= 600 - assert 30 <= constants.DEFAULT_TIMEOUT_LONG <= 600 - - def 
test_cost_per_record_is_reasonable(self): - """Test cost per record is reasonable.""" - # Should be between $0.0001 and $0.01 per record - assert 0.0001 <= constants.DEFAULT_COST_PER_RECORD <= 0.01 - - def test_min_poll_timeout_is_reasonable(self): - """Test minimum poll timeout is reasonable.""" - # Should be at least 1 minute - assert constants.DEFAULT_MIN_POLL_TIMEOUT >= 60 diff --git a/tests/unit/test_engine.py b/tests/unit/test_engine.py index 958f4b2..2b82641 100644 --- a/tests/unit/test_engine.py +++ b/tests/unit/test_engine.py @@ -1 +1,314 @@ -"""Unit tests for engine.""" +"""Tests for core/engine.py — AsyncEngine HTTP communication.""" + +import asyncio +import ssl +from unittest.mock import AsyncMock, MagicMock, patch + +import aiohttp +import pytest + +from brightdata.core.engine import AsyncEngine +from brightdata.exceptions import AuthenticationError, NetworkError, SSLError + + +# --------------------------------------------------------------------------- +# Initialization +# --------------------------------------------------------------------------- + + +class TestEngineInit: + def test_stores_bearer_token(self): + engine = AsyncEngine(bearer_token="tok_abc") + assert engine.bearer_token == "tok_abc" + + def test_default_timeout(self): + engine = AsyncEngine(bearer_token="tok") + assert engine.timeout.total == 30 + + def test_custom_timeout(self): + engine = AsyncEngine(bearer_token="tok", timeout=120) + assert engine.timeout.total == 120 + + def test_default_rate_limit(self): + engine = AsyncEngine(bearer_token="tok") + assert engine._rate_limit == AsyncEngine.DEFAULT_RATE_LIMIT + + def test_custom_rate_limit(self): + engine = AsyncEngine(bearer_token="tok", rate_limit=5, rate_period=2.0) + assert engine._rate_limit == 5 + assert engine._rate_period == 2.0 + + def test_session_none_before_enter(self): + engine = AsyncEngine(bearer_token="tok") + assert engine._session is None + + +# 
--------------------------------------------------------------------------- +# Context manager +# --------------------------------------------------------------------------- + + +class TestEngineContextManager: + @pytest.mark.asyncio + async def test_creates_session_on_enter(self): + engine = AsyncEngine(bearer_token="tok_abc") + async with engine: + assert engine._session is not None + assert not engine._session.closed + + @pytest.mark.asyncio + async def test_closes_session_on_exit(self): + engine = AsyncEngine(bearer_token="tok_abc") + async with engine: + session = engine._session + assert engine._session is None + assert session.closed + + @pytest.mark.asyncio + async def test_idempotent_enter(self): + """Calling __aenter__ twice should reuse the same session.""" + engine = AsyncEngine(bearer_token="tok") + async with engine: + session1 = engine._session + await engine.__aenter__() + session2 = engine._session + assert session1 is session2 + + @pytest.mark.asyncio + async def test_session_headers_contain_auth(self): + engine = AsyncEngine(bearer_token="my_secret_token") + async with engine: + headers = engine._session.headers + assert headers["Authorization"] == "Bearer my_secret_token" + assert headers["Content-Type"] == "application/json" + assert "brightdata-sdk/" in headers["User-Agent"] + + +# --------------------------------------------------------------------------- +# Request routing (without real HTTP — tests the method dispatch) +# --------------------------------------------------------------------------- + + +class TestRequestRouting: + @pytest.mark.asyncio + async def test_raises_if_no_session(self): + engine = AsyncEngine(bearer_token="tok") + with pytest.raises(RuntimeError, match="must be used as async context manager"): + engine.request("GET", "/test") + + @pytest.mark.asyncio + async def test_get_delegates_to_request(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + with patch.object(engine, "request", 
return_value="cm") as mock_req: + result = engine.get("/endpoint", params={"a": "1"}) + mock_req.assert_called_once_with( + "GET", "/endpoint", params={"a": "1"}, headers=None + ) + + @pytest.mark.asyncio + async def test_post_delegates_to_request(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + with patch.object(engine, "request", return_value="cm") as mock_req: + result = engine.post("/endpoint", json_data={"k": "v"}) + mock_req.assert_called_once_with( + "POST", "/endpoint", json_data={"k": "v"}, params=None, headers=None + ) + + @pytest.mark.asyncio + async def test_delete_delegates_to_request(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + with patch.object(engine, "request", return_value="cm") as mock_req: + result = engine.delete("/endpoint") + mock_req.assert_called_once_with( + "DELETE", "/endpoint", json_data=None, params=None, headers=None + ) + + @pytest.mark.asyncio + async def test_request_builds_full_url(self): + """request() should prepend BASE_URL to the endpoint.""" + engine = AsyncEngine(bearer_token="tok") + async with engine: + cm = engine.request("GET", "/v3/test") + # The URL is stored inside the ResponseContextManager + assert cm._url == f"{AsyncEngine.BASE_URL}/v3/test" + + @pytest.mark.asyncio + async def test_request_merges_custom_headers(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + cm = engine.request("GET", "/test", headers={"X-Custom": "val"}) + assert cm._headers["X-Custom"] == "val" + # Original auth header should still be there + assert "Bearer tok" in cm._headers["Authorization"] + + @pytest.mark.asyncio + async def test_post_to_url_uses_full_url(self): + """post_to_url should NOT prepend BASE_URL.""" + engine = AsyncEngine(bearer_token="tok") + async with engine: + cm = engine.post_to_url("https://custom.api.com/trigger", json_data={"x": 1}) + assert cm._url == "https://custom.api.com/trigger" + + @pytest.mark.asyncio + async def 
test_get_from_url_uses_full_url(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + cm = engine.get_from_url("https://custom.api.com/data", params={"fmt": "json"}) + assert cm._url == "https://custom.api.com/data" + assert cm._params == {"fmt": "json"} + + +# --------------------------------------------------------------------------- +# Error handling (mock aiohttp session to simulate failures) +# --------------------------------------------------------------------------- + + +class TestErrorHandling: + @pytest.mark.asyncio + async def test_401_raises_authentication_error(self): + engine = AsyncEngine(bearer_token="bad_token") + async with engine: + # Mock the session.request to return a 401 response + mock_resp = AsyncMock() + mock_resp.status = 401 + mock_resp.text = AsyncMock(return_value="Unauthorized") + mock_resp.release = AsyncMock() + engine._session.request = AsyncMock(return_value=mock_resp) + + cm = engine.get("/test") + with pytest.raises(AuthenticationError, match="Unauthorized"): + async with cm: + pass + + @pytest.mark.asyncio + async def test_403_raises_authentication_error(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + mock_resp = AsyncMock() + mock_resp.status = 403 + mock_resp.text = AsyncMock(return_value="Forbidden") + mock_resp.release = AsyncMock() + engine._session.request = AsyncMock(return_value=mock_resp) + + cm = engine.get("/test") + with pytest.raises(AuthenticationError, match="Forbidden"): + async with cm: + pass + + @pytest.mark.asyncio + async def test_200_returns_response(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + mock_resp = AsyncMock() + mock_resp.status = 200 + mock_resp.json = AsyncMock(return_value={"ok": True}) + mock_resp.close = MagicMock() + engine._session.request = AsyncMock(return_value=mock_resp) + + async with engine.get("/test") as resp: + assert resp.status == 200 + data = await resp.json() + assert data == {"ok": True} + + 
@pytest.mark.asyncio + async def test_network_error_raises_network_error(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + engine._session.request = AsyncMock( + side_effect=aiohttp.ClientError("Connection refused") + ) + + cm = engine.get("/test") + with pytest.raises(NetworkError, match="Network error"): + async with cm: + pass + + @pytest.mark.asyncio + async def test_timeout_raises_timeout_error(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + engine._session.request = AsyncMock(side_effect=asyncio.TimeoutError()) + + cm = engine.get("/test") + with pytest.raises(TimeoutError, match="timeout"): + async with cm: + pass + + @pytest.mark.asyncio + async def test_ssl_error_raises_ssl_error(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + ssl_err = aiohttp.ClientConnectorCertificateError( + connection_key=MagicMock(), + certificate_error=ssl.SSLCertVerificationError("CERTIFICATE_VERIFY_FAILED"), + ) + engine._session.request = AsyncMock(side_effect=ssl_err) + + cm = engine.get("/test") + with pytest.raises(SSLError): + async with cm: + pass + + @pytest.mark.asyncio + async def test_os_error_with_ssl_raises_ssl_error(self): + engine = AsyncEngine(bearer_token="tok") + async with engine: + engine._session.request = AsyncMock( + side_effect=OSError("[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed") + ) + + cm = engine.get("/test") + with pytest.raises(SSLError): + async with cm: + pass + + @pytest.mark.asyncio + async def test_generic_os_error_raises_network_error(self): + """Non-SSL OSError should be NetworkError, not SSLError.""" + engine = AsyncEngine(bearer_token="tok") + async with engine: + engine._session.request = AsyncMock(side_effect=OSError("Connection reset by peer")) + + cm = engine.get("/test") + with pytest.raises(NetworkError, match="Network error"): + async with cm: + pass + + @pytest.mark.asyncio + async def test_response_closed_on_exit(self): + engine = 
AsyncEngine(bearer_token="tok") + async with engine: + mock_resp = MagicMock() + mock_resp.status = 200 + mock_resp.close = MagicMock() + engine._session.request = AsyncMock(return_value=mock_resp) + + async with engine.get("/test") as resp: + pass + mock_resp.close.assert_called_once() + + +# --------------------------------------------------------------------------- +# Rate limiter +# --------------------------------------------------------------------------- + + +class TestRateLimiter: + @pytest.mark.asyncio + async def test_rate_limiter_created_on_enter(self): + engine = AsyncEngine(bearer_token="tok", rate_limit=5) + async with engine: + if engine._rate_limiter is not None: + # aiolimiter installed + assert engine._rate_limiter is not None + # If aiolimiter not installed, limiter will be None — that's OK + + @pytest.mark.asyncio + async def test_rate_limiter_cleared_on_exit(self): + engine = AsyncEngine(bearer_token="tok", rate_limit=5) + async with engine: + pass + assert engine._rate_limiter is None diff --git a/tests/unit/test_engine_sharing.py b/tests/unit/test_engine_sharing.py deleted file mode 100644 index 4aa6ccd..0000000 --- a/tests/unit/test_engine_sharing.py +++ /dev/null @@ -1,217 +0,0 @@ -""" -Test script to verify AsyncEngine sharing across scrapers. - -This script verifies that the AsyncEngine duplication fix works correctly by: -1. Counting AsyncEngine instances before/after creating client -2. Accessing multiple scrapers and verifying only one engine exists -3. Ensuring resource efficiency and proper engine reuse - -Expected output: -- Before creating client: 0 engines -- After creating client: 1 engine -- After accessing all scrapers: 1 engine (SHOULD STILL BE 1) - -If this test passes, the fix is working correctly! 
-""" - -import gc -import sys -import os - -# Add src to path so we can import brightdata -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src")) - -from brightdata import BrightDataClient -from brightdata.core.engine import AsyncEngine - - -def count_engines(): - """Count the number of AsyncEngine instances in memory.""" - gc.collect() # Force garbage collection to get accurate count - engines = [obj for obj in gc.get_objects() if isinstance(obj, AsyncEngine)] - return len(engines) - - -def test_engine_sharing(): - """Test that only one engine is created and shared across all scrapers.""" - - print("=" * 70) - print("AsyncEngine Sharing Test") - print("=" * 70) - print() - - # Step 1: Check baseline (should be 0) - initial_count = count_engines() - print(f"✓ Step 1: Before creating client: {initial_count} engine(s)") - - if initial_count != 0: - print(f" ⚠️ Warning: Expected 0 engines, found {initial_count}") - print() - - # Step 2: Create client (should create 1 engine) - print("✓ Step 2: Creating BrightDataClient...") - - # Try to load token from environment, or use placeholder - token = os.getenv("BRIGHTDATA_API_TOKEN") - if not token: - print(" ⚠️ Warning: No BRIGHTDATA_API_TOKEN found, using placeholder") - token = "test_token_placeholder_12345" - - client = BrightDataClient(token=token) - - after_client_count = count_engines() - print(f"✓ Step 3: After creating client: {after_client_count} engine(s)") - - if after_client_count != 1: - print(f" ❌ FAILED: Expected 1 engine, found {after_client_count}") - return False - print() - - # Step 3: Access all scrapers (should still be 1 engine) - print("✓ Step 4: Accessing all scrapers...") - - scrapers_accessed = [] - - try: - # Access scrape services - _ = client.scrape.amazon - scrapers_accessed.append("amazon") - - _ = client.scrape.linkedin - scrapers_accessed.append("linkedin") - - _ = client.scrape.facebook - scrapers_accessed.append("facebook") - - _ = client.scrape.instagram - 
scrapers_accessed.append("instagram") - - _ = client.scrape.chatgpt - scrapers_accessed.append("chatgpt") - - # Access search services - _ = client.search.linkedin - scrapers_accessed.append("search.linkedin") - - _ = client.search.instagram - scrapers_accessed.append("search.instagram") - - _ = client.search.chatGPT - scrapers_accessed.append("search.chatGPT") - - print(f" Accessed {len(scrapers_accessed)} scrapers: {', '.join(scrapers_accessed)}") - - except Exception as e: - print(f" ⚠️ Warning: Error accessing scrapers: {e}") - - print() - - # Step 4: Count engines after accessing all scrapers - after_scrapers_count = count_engines() - print(f"✓ Step 5: After accessing all scrapers: {after_scrapers_count} engine(s)") - print() - - # Verify the result - print("=" * 70) - print("Test Results") - print("=" * 70) - - if after_scrapers_count == 1: - print("✅ SUCCESS! Only 1 AsyncEngine instance exists.") - print(" All scrapers are sharing the client's engine.") - print(" Resource efficiency: OPTIMAL") - print() - print(" Benefits:") - print(" • Single HTTP connection pool") - print(" • Unified rate limiting") - print(" • Reduced memory usage") - print(" • Better connection reuse") - return True - else: - print(f"❌ FAILED! 
Found {after_scrapers_count} AsyncEngine instances.") - print(" Expected: 1 engine (shared across all scrapers)") - print(f" Actual: {after_scrapers_count} engines (resource duplication)") - print() - print(" This means:") - print(" • Multiple connection pools created") - print(" • Inefficient resource usage") - print(" • Engine duplication not fixed") - return False - - -def test_standalone_scraper(): - """Test that standalone scrapers still work (backwards compatibility).""" - - print() - print("=" * 70) - print("Standalone Scraper Test (Backwards Compatibility)") - print("=" * 70) - print() - - # Clear any existing engines - gc.collect() - initial_count = count_engines() - - print(f"✓ Initial engine count: {initial_count}") - - # Import and create a standalone scraper - from brightdata.scrapers.amazon import AmazonScraper - - print("✓ Creating standalone AmazonScraper (without passing engine)...") - - try: - token = os.getenv("BRIGHTDATA_API_TOKEN", "test_token_placeholder_12345") - AmazonScraper(bearer_token=token) - - standalone_count = count_engines() - print(f"✓ After creating standalone scraper: {standalone_count} engine(s)") - - expected_count = initial_count + 1 - if standalone_count == expected_count: - print("✅ SUCCESS! Standalone scraper creates its own engine.") - print(" Backwards compatibility: MAINTAINED") - return True - else: - print(f"❌ FAILED! 
Expected {expected_count} engines, found {standalone_count}") - return False - - except Exception as e: - print(f"⚠️ Warning: Could not create standalone scraper: {e}") - print(" (This is expected if bearer token is missing)") - return True # Don't fail the test if token is missing - - -if __name__ == "__main__": - print() - print("╔" + "═" * 68 + "╗") - print("║" + " " * 15 + "AsyncEngine Duplication Fix Test" + " " * 20 + "║") - print("╚" + "═" * 68 + "╝") - print() - - # Run both tests - test1_passed = test_engine_sharing() - test2_passed = test_standalone_scraper() - - print() - print("=" * 70) - print("Final Results") - print("=" * 70) - print() - - if test1_passed and test2_passed: - print("✅ ALL TESTS PASSED!") - print() - print("The AsyncEngine duplication fix is working correctly:") - print("• Single engine shared across all client scrapers ✓") - print("• Standalone scrapers still create their own engine ✓") - print("• Backwards compatibility maintained ✓") - print("• Resource efficiency achieved ✓") - sys.exit(0) - else: - print("❌ SOME TESTS FAILED") - print() - if not test1_passed: - print("• Engine sharing test failed - duplication still exists") - if not test2_passed: - print("• Standalone scraper test failed - backwards compatibility broken") - sys.exit(1) diff --git a/tests/unit/test_facebook.py b/tests/unit/test_facebook.py deleted file mode 100644 index ed2bfa3..0000000 --- a/tests/unit/test_facebook.py +++ /dev/null @@ -1,262 +0,0 @@ -"""Unit tests for Facebook scraper.""" - -from brightdata import BrightDataClient -from brightdata.scrapers.facebook import FacebookScraper - - -class TestFacebookScraperURLBased: - """Test Facebook scraper (URL-based extraction).""" - - def test_facebook_scraper_has_posts_by_profile_method(self): - """Test Facebook scraper has posts_by_profile method (async-first API).""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "posts_by_profile") - assert 
callable(scraper.posts_by_profile) - - def test_facebook_scraper_has_posts_by_group_method(self): - """Test Facebook scraper has posts_by_group method (async-first API).""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "posts_by_group") - assert callable(scraper.posts_by_group) - - def test_facebook_scraper_has_posts_by_url_method(self): - """Test Facebook scraper has posts_by_url method (async-first API).""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "posts_by_url") - assert callable(scraper.posts_by_url) - - def test_facebook_scraper_has_comments_method(self): - """Test Facebook scraper has comments method (async-first API).""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "comments") - assert callable(scraper.comments) - - def test_facebook_scraper_has_reels_method(self): - """Test Facebook scraper has reels method (async-first API).""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "reels") - assert callable(scraper.reels) - - def test_posts_by_profile_method_signature(self): - """Test posts_by_profile method has correct signature.""" - import inspect - - scraper = FacebookScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.posts_by_profile) - - # Required: url parameter - assert "url" in sig.parameters - - # Optional filters - assert "num_of_posts" in sig.parameters - assert "posts_to_not_include" in sig.parameters - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - assert "timeout" in sig.parameters - - # Defaults - assert sig.parameters["timeout"].default == 240 - - def test_posts_by_group_method_signature(self): - """Test posts_by_group method has correct signature.""" - import inspect - - scraper = FacebookScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.posts_by_group) - - # Required: 
url - assert "url" in sig.parameters - - # Optional filters - assert "num_of_posts" in sig.parameters - assert "posts_to_not_include" in sig.parameters - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - assert "timeout" in sig.parameters - - # Defaults - assert sig.parameters["timeout"].default == 240 - - def test_posts_by_url_method_signature(self): - """Test posts_by_url method has correct signature.""" - import inspect - - scraper = FacebookScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.posts_by_url) - - assert "url" in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 240 - - def test_comments_method_signature(self): - """Test comments method has correct signature.""" - import inspect - - scraper = FacebookScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.comments) - - assert "url" in sig.parameters - assert "num_of_comments" in sig.parameters - assert "comments_to_not_include" in sig.parameters - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 240 - - def test_reels_method_signature(self): - """Test reels method has correct signature.""" - import inspect - - scraper = FacebookScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.reels) - - assert "url" in sig.parameters - assert "num_of_posts" in sig.parameters - assert "posts_to_not_include" in sig.parameters - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 240 - - -class TestFacebookDatasetIDs: - """Test Facebook has correct dataset IDs.""" - - def test_scraper_has_all_dataset_ids(self): - """Test scraper has dataset IDs for all types.""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert 
scraper.DATASET_ID # Default: Posts by Profile - assert scraper.DATASET_ID_POSTS_PROFILE - assert scraper.DATASET_ID_POSTS_GROUP - assert scraper.DATASET_ID_POSTS_URL - assert scraper.DATASET_ID_COMMENTS - assert scraper.DATASET_ID_REELS - - # All should start with gd_ - assert scraper.DATASET_ID.startswith("gd_") - assert scraper.DATASET_ID_POSTS_PROFILE.startswith("gd_") - assert scraper.DATASET_ID_POSTS_GROUP.startswith("gd_") - assert scraper.DATASET_ID_POSTS_URL.startswith("gd_") - assert scraper.DATASET_ID_COMMENTS.startswith("gd_") - assert scraper.DATASET_ID_REELS.startswith("gd_") - - def test_scraper_has_platform_name(self): - """Test scraper has correct platform name.""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert scraper.PLATFORM_NAME == "facebook" - - def test_scraper_has_cost_per_record(self): - """Test scraper has cost per record.""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "COST_PER_RECORD") - assert isinstance(scraper.COST_PER_RECORD, (int, float)) - assert scraper.COST_PER_RECORD > 0 - - -class TestFacebookScraperRegistration: - """Test Facebook scraper is registered correctly.""" - - def test_facebook_is_registered(self): - """Test Facebook scraper is in registry.""" - from brightdata.scrapers.registry import is_platform_supported, get_registered_platforms - - assert is_platform_supported("facebook") - assert "facebook" in get_registered_platforms() - - def test_can_get_facebook_scraper_from_registry(self): - """Test can get Facebook scraper from registry.""" - from brightdata.scrapers.registry import get_scraper_for - - scraper_class = get_scraper_for("facebook") - assert scraper_class is not None - assert scraper_class.__name__ == "FacebookScraper" - - -class TestFacebookClientIntegration: - """Test Facebook scraper integration with BrightDataClient.""" - - def test_client_has_facebook_scraper_access(self): - """Test client provides access to Facebook 
scraper.""" - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client, "scrape") - assert hasattr(client.scrape, "facebook") - - def test_client_facebook_scraper_has_all_methods(self): - """Test client.scrape.facebook has all Facebook methods.""" - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client.scrape.facebook, "posts_by_profile") - assert hasattr(client.scrape.facebook, "posts_by_group") - assert hasattr(client.scrape.facebook, "posts_by_url") - assert hasattr(client.scrape.facebook, "comments") - assert hasattr(client.scrape.facebook, "reels") - - def test_facebook_scraper_instance_from_client(self): - """Test Facebook scraper instance is FacebookScraper.""" - client = BrightDataClient(token="test_token_123456789") - - assert isinstance(client.scrape.facebook, FacebookScraper) - - -class TestFacebookScraperConfiguration: - """Test Facebook scraper configuration.""" - - def test_scraper_initialization_with_token(self): - """Test scraper can be initialized with bearer token.""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert scraper.bearer_token == "test_token_123456789" - - def test_scraper_has_engine(self): - """Test scraper has engine instance.""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "engine") - assert scraper.engine is not None - - def test_scraper_has_api_client(self): - """Test scraper has API client.""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "api_client") - assert scraper.api_client is not None - - def test_scraper_has_workflow_executor(self): - """Test scraper has workflow executor.""" - scraper = FacebookScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "workflow_executor") - assert scraper.workflow_executor is not None - - -class TestFacebookScraperExports: - """Test Facebook scraper is properly exported.""" - - def 
test_facebook_scraper_in_module_exports(self): - """Test FacebookScraper is in scrapers module __all__.""" - from brightdata import scrapers - - assert "FacebookScraper" in scrapers.__all__ - - def test_can_import_facebook_scraper_directly(self): - """Test can import FacebookScraper directly.""" - from brightdata.scrapers import FacebookScraper as FB - - assert FB is not None - assert FB.__name__ == "FacebookScraper" - - def test_can_import_from_facebook_submodule(self): - """Test can import from facebook submodule.""" - from brightdata.scrapers.facebook import FacebookScraper as FB - - assert FB is not None - assert FB.__name__ == "FacebookScraper" diff --git a/tests/unit/test_function_detection.py b/tests/unit/test_function_detection.py deleted file mode 100644 index fcf1319..0000000 --- a/tests/unit/test_function_detection.py +++ /dev/null @@ -1,251 +0,0 @@ -"""Unit tests for function detection utilities.""" - -from brightdata.utils.function_detection import get_caller_function_name - - -class TestFunctionDetection: - """Test function name detection utilities.""" - - def test_get_caller_function_name_exists(self): - """Test get_caller_function_name function exists.""" - assert callable(get_caller_function_name) - - def test_get_caller_function_name_returns_string(self): - """Test get_caller_function_name returns a string.""" - - def test_function(): - return get_caller_function_name() - - result = test_function() - assert isinstance(result, str) - - def test_get_caller_function_name_detects_caller(self): - """Test get_caller_function_name detects calling function name.""" - - def outer_function(): - return get_caller_function_name() - - result = outer_function() - # Should detect 'outer_function' or similar - assert len(result) > 0 - - def test_get_caller_function_name_in_nested_calls(self): - """Test get_caller_function_name works in nested function calls.""" - - def level_3(): - return get_caller_function_name() - - def level_2(): - return level_3() - - def 
level_1(): - return level_2() - - result = level_1() - # Should return a valid function name - assert isinstance(result, str) - assert len(result) > 0 - - def test_get_caller_function_name_handles_no_caller(self): - """Test get_caller_function_name handles cases with no clear caller.""" - # Call from module level (no function context) - result = get_caller_function_name() - # Should return something (empty string, None, or a default) - assert result is not None - - -class TestFunctionDetectionInScrapers: - """Test function detection is used in scrapers.""" - - def test_function_detection_imported_in_base_scraper(self): - """Test function detection is imported in base scraper.""" - from brightdata.scrapers import base - - import inspect - - source = inspect.getsource(base) - assert "get_caller_function_name" in source or "function_detection" in source - - def test_function_detection_used_for_sdk_function_parameter(self): - """Test function detection is used to set sdk_function parameter.""" - from brightdata.scrapers import base - - # Check if sdk_function parameter is used in base scraper - import inspect - - source = inspect.getsource(base) - assert "sdk_function" in source - - -class TestSDKFunctionParameterTracking: - """Test sdk_function parameter tracking in scrapers.""" - - def test_amazon_scraper_methods_accept_sdk_function(self): - """Test Amazon scraper methods can track sdk_function.""" - from brightdata.scrapers.amazon import AmazonScraper - import inspect - - scraper = AmazonScraper(bearer_token="test_token_123456789") - - # Amazon uses _scrape_with_params which may have sdk_function - # Note: Amazon's _scrape_urls doesn't have sdk_function, but it's - # passed through workflow_executor.execute() which does accept it - if hasattr(scraper, "_scrape_with_params"): - inspect.signature(scraper._scrape_with_params) - # sdk_function is handled internally via get_caller_function_name() - assert True # Test passes - sdk_function is tracked via function 
detection - - def test_linkedin_scraper_methods_accept_sdk_function(self): - """Test LinkedIn scraper methods can track sdk_function.""" - from brightdata.scrapers.linkedin import LinkedInScraper - import inspect - - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - # LinkedIn uses _scrape_with_params which may have sdk_function - # Note: LinkedIn's _scrape_urls doesn't have sdk_function, but it's - # passed through workflow_executor.execute() which does accept it - if hasattr(scraper, "_scrape_with_params"): - inspect.signature(scraper._scrape_with_params) - # sdk_function is handled internally via get_caller_function_name() - assert True # Test passes - sdk_function is tracked via function detection - - def test_facebook_scraper_methods_accept_sdk_function(self): - """Test Facebook scraper methods can track sdk_function.""" - from brightdata.scrapers.facebook import FacebookScraper - import inspect - - scraper = FacebookScraper(bearer_token="test_token_123456789") - - # Check if internal methods accept sdk_function parameter - if hasattr(scraper, "_scrape_urls"): - sig = inspect.signature(scraper._scrape_urls) - assert "sdk_function" in sig.parameters - - def test_instagram_scraper_methods_use_function_detection(self): - """Test Instagram scraper methods use function detection internally.""" - from brightdata.scrapers.instagram import InstagramScraper - import inspect - - scraper = InstagramScraper(bearer_token="test_token_123456789") - - # Instagram scraper's _scrape_urls calls get_caller_function_name() internally - # rather than accepting sdk_function as a parameter - if hasattr(scraper, "_scrape_urls"): - # Verify the method exists and is callable - assert callable(scraper._scrape_urls) - # Check it has the expected parameters (url, dataset_id, timeout) - sig = inspect.signature(scraper._scrape_urls) - assert "url" in sig.parameters - assert "dataset_id" in sig.parameters - assert "timeout" in sig.parameters - - -class 
TestSDKFunctionUsagePatterns: - """Test sdk_function parameter usage patterns.""" - - def test_sdk_function_can_be_none(self): - """Test sdk_function parameter can be None.""" - # Function detection should handle None gracefully - result = get_caller_function_name() - # Should return a string (possibly empty) or None, not crash - assert result is None or isinstance(result, str) - - def test_sdk_function_provides_context_for_monitoring(self): - """Test sdk_function provides context for monitoring and analytics.""" - # This is a design test - sdk_function should be passed through - # the workflow executor to enable analytics - from brightdata.scrapers.workflow import WorkflowExecutor - import inspect - - # Check if WorkflowExecutor.execute accepts sdk_function - sig = inspect.signature(WorkflowExecutor.execute) - assert "sdk_function" in sig.parameters - - -class TestFunctionDetectionEdgeCases: - """Test function detection edge cases.""" - - def test_function_detection_with_lambda(self): - """Test function detection with lambda functions.""" - - def func(): - return get_caller_function_name() - - result = func() - # Should handle lambda gracefully - assert result is None or isinstance(result, str) - - def test_function_detection_with_method(self): - """Test function detection with class methods.""" - - class TestClass: - def method(self): - return get_caller_function_name() - - obj = TestClass() - result = obj.method() - # Should detect method name - assert isinstance(result, str) - - def test_function_detection_with_static_method(self): - """Test function detection with static methods.""" - - class TestClass: - @staticmethod - def static_method(): - return get_caller_function_name() - - result = TestClass.static_method() - # Should handle static method - assert result is None or isinstance(result, str) - - def test_function_detection_with_class_method(self): - """Test function detection with class methods.""" - - class TestClass: - @classmethod - def 
class_method(cls): - return get_caller_function_name() - - result = TestClass.class_method() - # Should handle class method - assert result is None or isinstance(result, str) - - -class TestFunctionDetectionPerformance: - """Test function detection performance characteristics.""" - - def test_function_detection_is_fast(self): - """Test function detection doesn't add significant overhead.""" - import time - - def test_function(): - return get_caller_function_name() - - # Measure time for 1000 calls - start = time.time() - for _ in range(1000): - test_function() - elapsed = time.time() - start - - # Should complete in less than 1 second for 1000 calls - assert elapsed < 1.0 - - def test_function_detection_doesnt_cause_memory_leak(self): - """Test function detection doesn't cause memory leaks.""" - import sys - - def test_function(): - return get_caller_function_name() - - # Get initial reference count - initial_refs = sys.getrefcount(test_function) - - # Call many times - for _ in range(100): - test_function() - - # Reference count shouldn't grow significantly - final_refs = sys.getrefcount(test_function) - assert final_refs <= initial_refs + 5 # Allow small variation diff --git a/tests/unit/test_instagram.py b/tests/unit/test_instagram.py deleted file mode 100644 index 0f43e64..0000000 --- a/tests/unit/test_instagram.py +++ /dev/null @@ -1,390 +0,0 @@ -"""Unit tests for Instagram scraper.""" - -from brightdata import BrightDataClient -from brightdata.scrapers.instagram import InstagramScraper, InstagramSearchScraper - - -class TestInstagramScraperURLBased: - """Test Instagram scraper (URL-based extraction).""" - - def test_instagram_scraper_has_profiles_method(self): - """Test Instagram scraper has profiles method (async-first API).""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "profiles") - assert callable(scraper.profiles) - - def test_instagram_scraper_has_posts_method(self): - """Test Instagram scraper has posts 
method (async-first API).""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "posts") - assert callable(scraper.posts) - - def test_instagram_scraper_has_comments_method(self): - """Test Instagram scraper has comments method (async-first API).""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "comments") - assert callable(scraper.comments) - - def test_instagram_scraper_has_reels_method(self): - """Test Instagram scraper has reels method (async-first API).""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "reels") - assert callable(scraper.reels) - - def test_profiles_method_signature(self): - """Test profiles method has correct signature.""" - import inspect - - scraper = InstagramScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.profiles) - - # Required: url parameter - assert "url" in sig.parameters - assert "timeout" in sig.parameters - - # Defaults (180s = DEFAULT_TIMEOUT_SHORT, same as LinkedIn) - assert sig.parameters["timeout"].default == 180 - - def test_posts_method_signature(self): - """Test posts method has correct signature.""" - import inspect - - scraper = InstagramScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.posts) - - assert "url" in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 180 - - def test_comments_method_signature(self): - """Test comments method has correct signature.""" - import inspect - - scraper = InstagramScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.comments) - - assert "url" in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 180 - - def test_reels_method_signature(self): - """Test reels method has correct signature.""" - import inspect - - scraper = InstagramScraper(bearer_token="test_token_123456789") 
- sig = inspect.signature(scraper.reels) - - assert "url" in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 180 - - -class TestInstagramSearchScraper: - """Test Instagram search scraper (parameter-based discovery).""" - - def test_instagram_search_scraper_has_profiles_method(self): - """Test Instagram search scraper has profiles method for username discovery.""" - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "profiles") - assert callable(scraper.profiles) - - def test_instagram_search_scraper_has_posts_method(self): - """Test Instagram search scraper has posts method (async-first API).""" - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "posts") - assert callable(scraper.posts) - - def test_instagram_search_scraper_has_reels_method(self): - """Test Instagram search scraper has reels method (async-first API).""" - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "reels") - assert callable(scraper.reels) - - def test_instagram_search_scraper_has_reels_all_method(self): - """Test Instagram search scraper has reels_all method.""" - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "reels_all") - assert callable(scraper.reels_all) - - def test_search_profiles_method_signature(self): - """Test search profiles method has correct signature.""" - import inspect - - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.profiles) - - # Required: user_name parameter (NOT url) - assert "user_name" in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 180 - - def test_search_posts_method_signature(self): - """Test search posts method has correct signature.""" - import inspect - - scraper = 
InstagramSearchScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.posts) - - # Required: url parameter - assert "url" in sig.parameters - - # Optional filters - assert "num_of_posts" in sig.parameters - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - assert "post_type" in sig.parameters - assert "posts_to_not_include" in sig.parameters - assert "timeout" in sig.parameters - - # Defaults (180s = DEFAULT_TIMEOUT_SHORT) - assert sig.parameters["timeout"].default == 180 - - def test_search_reels_method_signature(self): - """Test search reels method has correct signature.""" - import inspect - - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.reels) - - assert "url" in sig.parameters - assert "num_of_posts" in sig.parameters - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 180 - - def test_search_reels_all_method_signature(self): - """Test search reels_all method has correct signature.""" - import inspect - - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.reels_all) - - assert "url" in sig.parameters - assert "num_of_posts" in sig.parameters - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 180 - - -class TestInstagramDatasetIDs: - """Test Instagram has correct dataset IDs.""" - - def test_scraper_has_all_dataset_ids(self): - """Test scraper has dataset IDs for all types.""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert scraper.DATASET_ID # Default: Profiles - assert scraper.DATASET_ID_POSTS - assert scraper.DATASET_ID_COMMENTS - assert scraper.DATASET_ID_REELS - - # All should start with gd_ - assert scraper.DATASET_ID.startswith("gd_") - assert 
scraper.DATASET_ID_POSTS.startswith("gd_") - assert scraper.DATASET_ID_COMMENTS.startswith("gd_") - assert scraper.DATASET_ID_REELS.startswith("gd_") - - def test_search_scraper_has_dataset_ids(self): - """Test search scraper has dataset IDs.""" - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - - assert scraper.DATASET_ID_PROFILES - assert scraper.DATASET_ID_POSTS - assert scraper.DATASET_ID_REELS - - assert scraper.DATASET_ID_PROFILES.startswith("gd_") - assert scraper.DATASET_ID_POSTS.startswith("gd_") - assert scraper.DATASET_ID_REELS.startswith("gd_") - - def test_scraper_has_platform_name(self): - """Test scraper has correct platform name.""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert scraper.PLATFORM_NAME == "instagram" - - def test_scraper_has_cost_per_record(self): - """Test scraper has cost per record.""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "COST_PER_RECORD") - assert isinstance(scraper.COST_PER_RECORD, (int, float)) - assert scraper.COST_PER_RECORD > 0 - - -class TestInstagramScraperRegistration: - """Test Instagram scraper is registered correctly.""" - - def test_instagram_is_registered(self): - """Test Instagram scraper is in registry.""" - from brightdata.scrapers.registry import is_platform_supported, get_registered_platforms - - assert is_platform_supported("instagram") - assert "instagram" in get_registered_platforms() - - def test_can_get_instagram_scraper_from_registry(self): - """Test can get Instagram scraper from registry.""" - from brightdata.scrapers.registry import get_scraper_for - - scraper_class = get_scraper_for("instagram") - assert scraper_class is not None - assert scraper_class.__name__ == "InstagramScraper" - - -class TestInstagramClientIntegration: - """Test Instagram scraper integration with BrightDataClient.""" - - def test_client_has_instagram_scraper_access(self): - """Test client provides access to Instagram 
scraper.""" - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client, "scrape") - assert hasattr(client.scrape, "instagram") - - def test_client_instagram_scraper_has_all_methods(self): - """Test client.scrape.instagram has all Instagram methods.""" - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client.scrape.instagram, "profiles") - assert hasattr(client.scrape.instagram, "posts") - assert hasattr(client.scrape.instagram, "comments") - assert hasattr(client.scrape.instagram, "reels") - - def test_instagram_scraper_instance_from_client(self): - """Test Instagram scraper instance is InstagramScraper.""" - client = BrightDataClient(token="test_token_123456789") - - assert isinstance(client.scrape.instagram, InstagramScraper) - - def test_client_has_instagram_search_access(self): - """Test client provides access to Instagram search.""" - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client, "search") - assert hasattr(client.search, "instagram") - - def test_client_instagram_search_has_methods(self): - """Test client.search.instagram has discovery methods.""" - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client.search.instagram, "profiles") - assert hasattr(client.search.instagram, "posts") - assert hasattr(client.search.instagram, "reels") - assert hasattr(client.search.instagram, "reels_all") - - def test_instagram_search_instance_from_client(self): - """Test Instagram search instance is InstagramSearchScraper.""" - client = BrightDataClient(token="test_token_123456789") - - assert isinstance(client.search.instagram, InstagramSearchScraper) - - -class TestInstagramScraperConfiguration: - """Test Instagram scraper configuration.""" - - def test_scraper_initialization_with_token(self): - """Test scraper can be initialized with bearer token.""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert scraper.bearer_token == 
"test_token_123456789" - - def test_search_scraper_initialization_with_token(self): - """Test search scraper can be initialized with bearer token.""" - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - - assert scraper.bearer_token == "test_token_123456789" - - def test_scraper_has_engine(self): - """Test scraper has engine instance.""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "engine") - assert scraper.engine is not None - - def test_search_scraper_has_engine(self): - """Test search scraper has engine instance.""" - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "engine") - assert scraper.engine is not None - - def test_scraper_has_api_client(self): - """Test scraper has API client.""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "api_client") - assert scraper.api_client is not None - - def test_scraper_has_workflow_executor(self): - """Test scraper has workflow executor.""" - scraper = InstagramScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "workflow_executor") - assert scraper.workflow_executor is not None - - -class TestInstagramScraperExports: - """Test Instagram scraper is properly exported.""" - - def test_instagram_scraper_in_module_exports(self): - """Test InstagramScraper is in scrapers module __all__.""" - from brightdata import scrapers - - assert "InstagramScraper" in scrapers.__all__ - - def test_instagram_search_scraper_in_module_exports(self): - """Test InstagramSearchScraper is in scrapers module __all__.""" - from brightdata import scrapers - - assert "InstagramSearchScraper" in scrapers.__all__ - - def test_can_import_instagram_scraper_directly(self): - """Test can import InstagramScraper directly.""" - from brightdata.scrapers import InstagramScraper as IG - - assert IG is not None - assert IG.__name__ == "InstagramScraper" - - def 
test_can_import_instagram_search_scraper_directly(self): - """Test can import InstagramSearchScraper directly.""" - from brightdata.scrapers import InstagramSearchScraper as IGSearch - - assert IGSearch is not None - assert IGSearch.__name__ == "InstagramSearchScraper" - - def test_can_import_from_instagram_submodule(self): - """Test can import from instagram submodule.""" - from brightdata.scrapers.instagram import InstagramScraper as IG - from brightdata.scrapers.instagram import InstagramSearchScraper as IGSearch - - assert IG is not None - assert IG.__name__ == "InstagramScraper" - assert IGSearch is not None - assert IGSearch.__name__ == "InstagramSearchScraper" - - -class TestInstagramDiscoveryExtraParams: - """Test Instagram discovery uses extra_params correctly.""" - - def test_search_scraper_has_execute_discovery_method(self): - """Test search scraper has internal _execute_discovery method.""" - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "_execute_discovery") - assert callable(scraper._execute_discovery) - - def test_search_scraper_has_context_manager(self): - """Test search scraper supports async context manager.""" - scraper = InstagramSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "__aenter__") - assert hasattr(scraper, "__aexit__") diff --git a/tests/unit/test_linkedin.py b/tests/unit/test_linkedin.py deleted file mode 100644 index 4e30902..0000000 --- a/tests/unit/test_linkedin.py +++ /dev/null @@ -1,535 +0,0 @@ -"""Unit tests for LinkedIn scraper and search services.""" - -from brightdata import BrightDataClient -from brightdata.scrapers.linkedin import LinkedInScraper, LinkedInSearchScraper - - -class TestLinkedInScraperURLBased: - """Test LinkedIn scraper (URL-based extraction).""" - - def test_linkedin_scraper_has_posts_method(self): - """Test LinkedIn scraper has posts method (async-first API).""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") 
- - assert hasattr(scraper, "posts") - assert callable(scraper.posts) - - def test_linkedin_scraper_has_jobs_method(self): - """Test LinkedIn scraper has jobs method (async-first API).""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "jobs") - assert callable(scraper.jobs) - - def test_linkedin_scraper_has_profiles_method(self): - """Test LinkedIn scraper has profiles method (async-first API).""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "profiles") - assert callable(scraper.profiles) - - def test_linkedin_scraper_has_companies_method(self): - """Test LinkedIn scraper has companies method (async-first API).""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "companies") - assert callable(scraper.companies) - - def test_posts_method_signature(self): - """Test posts method has correct signature.""" - import inspect - - scraper = LinkedInScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.posts) - - # Required: url parameter - assert "url" in sig.parameters - - # Optional: sync and timeout - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - - # Defaults - assert sig.parameters["timeout"].default == 180 - - def test_jobs_method_signature(self): - """Test jobs method has correct signature.""" - import inspect - - scraper = LinkedInScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.jobs) - - assert "url" in sig.parameters - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 180 - - def test_profiles_method_signature(self): - """Test profiles method has correct signature.""" - import inspect - - scraper = LinkedInScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.profiles) - - assert "url" in sig.parameters - assert "sync" not in sig.parameters - assert 
"timeout" in sig.parameters - - def test_companies_method_signature(self): - """Test companies method has correct signature.""" - import inspect - - scraper = LinkedInScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.companies) - - assert "url" in sig.parameters - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - - -class TestLinkedInSearchScraper: - """Test LinkedIn search service (discovery/parameter-based).""" - - def test_linkedin_search_has_posts_method(self): - """Test LinkedIn search has posts discovery method (async-first API).""" - search = LinkedInSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(search, "posts") - assert callable(search.posts) - - def test_linkedin_search_has_profiles_method(self): - """Test LinkedIn search has profiles discovery method (async-first API).""" - search = LinkedInSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(search, "profiles") - assert callable(search.profiles) - - def test_linkedin_search_has_jobs_method(self): - """Test LinkedIn search has jobs discovery method (async-first API).""" - search = LinkedInSearchScraper(bearer_token="test_token_123456789") - - assert hasattr(search, "jobs") - assert callable(search.jobs) - - def test_search_posts_signature(self): - """Test search.posts has correct signature.""" - import inspect - - search = LinkedInSearchScraper(bearer_token="test_token_123456789") - sig = inspect.signature(search.posts) - - # Required: url (profile URL) - assert "url" in sig.parameters - - # Optional: start_date, end_date, timeout - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - assert "timeout" in sig.parameters - - def test_search_profiles_signature(self): - """Test search.profiles has correct signature.""" - import inspect - - search = LinkedInSearchScraper(bearer_token="test_token_123456789") - sig = inspect.signature(search.profiles) - - # Required: first_name - assert 
"first_name" in sig.parameters - - # Optional: last_name, timeout - assert "last_name" in sig.parameters - assert "timeout" in sig.parameters - - def test_search_jobs_signature(self): - """Test search.jobs has correct signature.""" - import inspect - - search = LinkedInSearchScraper(bearer_token="test_token_123456789") - sig = inspect.signature(search.jobs) - - # All parameters should be present - params = sig.parameters - assert "url" in params - assert "location" in params - assert "keyword" in params - assert "country" in params - assert "timeRange" in params - assert "jobType" in params - assert "experienceLevel" in params - assert "remote" in params - assert "company" in params - assert "locationRadius" in params - assert "timeout" in params - - -class TestLinkedInDualNamespaces: - """Test LinkedIn has both scrape and search namespaces.""" - - def test_client_has_scrape_linkedin(self): - """Test client.scrape.linkedin exists.""" - client = BrightDataClient(token="test_token_123456789") - - scraper = client.scrape.linkedin - assert scraper is not None - assert isinstance(scraper, LinkedInScraper) - - def test_client_has_search_linkedin(self): - """Test client.search.linkedin exists.""" - client = BrightDataClient(token="test_token_123456789") - - search = client.search.linkedin - assert search is not None - assert isinstance(search, LinkedInSearchScraper) - - def test_scrape_vs_search_distinction(self): - """Test clear distinction between scrape and search.""" - client = BrightDataClient(token="test_token_123456789") - - scraper = client.scrape.linkedin - search = client.search.linkedin - - # Scraper uses 'url' parameter - import inspect - - scraper_sig = inspect.signature(scraper.posts) - assert "url" in scraper_sig.parameters - assert "sync" not in scraper_sig.parameters # sync parameter was removed - - # Search uses url + date range parameters - search_sig = inspect.signature(search.posts) - assert "url" in search_sig.parameters - assert "start_date" in 
search_sig.parameters - - def test_scrape_linkedin_methods_accept_url_list(self): - """Test scrape.linkedin methods accept url as str | list.""" - import inspect - - client = BrightDataClient(token="test_token_123456789") - scraper = client.scrape.linkedin - - # Check type hints - sig = inspect.signature(scraper.posts) - url_param = sig.parameters["url"] - - # Should accept Union[str, List[str]] - annotation_str = str(url_param.annotation) - assert "str" in annotation_str - assert "List" in annotation_str or "list" in annotation_str - - -class TestLinkedInDatasetIDs: - """Test LinkedIn has correct dataset IDs for each type.""" - - def test_scraper_has_all_dataset_ids(self): - """Test scraper has dataset IDs for all types.""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - assert scraper.DATASET_ID # Profiles - assert scraper.DATASET_ID_COMPANIES - assert scraper.DATASET_ID_JOBS - assert scraper.DATASET_ID_POSTS - - # All should start with gd_ - assert scraper.DATASET_ID.startswith("gd_") - assert scraper.DATASET_ID_COMPANIES.startswith("gd_") - assert scraper.DATASET_ID_JOBS.startswith("gd_") - assert scraper.DATASET_ID_POSTS.startswith("gd_") - - def test_search_has_dataset_ids(self): - """Test search service has dataset IDs.""" - search = LinkedInSearchScraper(bearer_token="test_token_123456789") - - assert search.DATASET_ID_POSTS - assert search.DATASET_ID_PROFILES - assert search.DATASET_ID_JOBS - - -class TestSyncVsAsyncMode: - """Test sync vs async mode handling.""" - - def test_default_timeout_is_correct(self): - """Test default timeout is 180s for async workflow.""" - import inspect - - scraper = LinkedInScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.posts) - - assert sig.parameters["timeout"].default == 180 - - def test_methods_dont_have_sync_parameter(self): - """Test all scrape methods don't have sync parameter (standard async pattern).""" - import inspect - - scraper = 
LinkedInScraper(bearer_token="test_token_123456789") - - for method_name in ["posts", "jobs", "profiles", "companies"]: - sig = inspect.signature(getattr(scraper, method_name)) - assert "sync" not in sig.parameters - - -class TestAPISpecCompliance: - """Test compliance with exact API specifications.""" - - def test_scrape_posts_api_spec(self): - """Test client.scrape.linkedin.posts matches API spec.""" - client = BrightDataClient(token="test_token_123456789") - - # API Spec: client.scrape.linkedin.posts(url, timeout=180) - import inspect - - sig = inspect.signature(client.scrape.linkedin.posts) - - assert "url" in sig.parameters - assert "sync" not in sig.parameters - assert "timeout" in sig.parameters - assert sig.parameters["timeout"].default == 180 - - def test_search_posts_api_spec(self): - """Test client.search.linkedin.posts matches API spec.""" - client = BrightDataClient(token="test_token_123456789") - - # API Spec: posts(url, start_date, end_date) - import inspect - - sig = inspect.signature(client.search.linkedin.posts) - - assert "url" in sig.parameters - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - - def test_search_profiles_api_spec(self): - """Test client.search.linkedin.profiles matches API spec.""" - client = BrightDataClient(token="test_token_123456789") - - # API Spec: profiles(first_name, last_name, timeout) - import inspect - - sig = inspect.signature(client.search.linkedin.profiles) - - assert "first_name" in sig.parameters - assert "last_name" in sig.parameters - assert "timeout" in sig.parameters - - def test_search_jobs_api_spec(self): - """Test client.search.linkedin.jobs matches API spec.""" - client = BrightDataClient(token="test_token_123456789") - - # API Spec: jobs(url, location, keyword, country, ...) 
- import inspect - - sig = inspect.signature(client.search.linkedin.jobs) - - params = sig.parameters - assert "url" in params - assert "location" in params - assert "keyword" in params - assert "country" in params - assert "timeRange" in params - assert "jobType" in params - assert "experienceLevel" in params - assert "remote" in params - assert "company" in params - assert "locationRadius" in params - assert "timeout" in params - - -class TestLinkedInClientIntegration: - """Test LinkedIn integrates properly with client.""" - - def test_linkedin_accessible_via_client_scrape(self): - """Test LinkedIn scraper accessible via client.scrape.linkedin.""" - client = BrightDataClient(token="test_token_123456789") - - linkedin = client.scrape.linkedin - assert linkedin is not None - assert isinstance(linkedin, LinkedInScraper) - - def test_linkedin_accessible_via_client_search(self): - """Test LinkedIn search accessible via client.search.linkedin.""" - client = BrightDataClient(token="test_token_123456789") - - linkedin_search = client.search.linkedin - assert linkedin_search is not None - assert isinstance(linkedin_search, LinkedInSearchScraper) - - def test_client_passes_token_to_scraper(self): - """Test client passes token to LinkedIn scraper.""" - token = "test_token_123456789" - client = BrightDataClient(token=token) - - linkedin = client.scrape.linkedin - assert linkedin.bearer_token == token - - def test_client_passes_token_to_search(self): - """Test client passes token to LinkedIn search.""" - token = "test_token_123456789" - client = BrightDataClient(token=token) - - search = client.search.linkedin - assert search.bearer_token == token - - -class TestInterfaceExamples: - """Test interface examples from specifications.""" - - def test_scrape_posts_interface(self): - """Test scrape.linkedin.posts interface.""" - client = BrightDataClient(token="test_token_123456789") - - # Interface: posts(url=str|list, timeout=180) - linkedin = client.scrape.linkedin - - # Should 
be callable - assert callable(linkedin.posts) - - # Accepts url, sync, timeout - import inspect - - sig = inspect.signature(linkedin.posts) - assert set(["url", "timeout"]).issubset(sig.parameters.keys()) - - def test_search_posts_interface(self): - """Test search.linkedin.posts interface.""" - client = BrightDataClient(token="test_token_123456789") - - # Interface: posts(url, start_date, end_date) - linkedin_search = client.search.linkedin - - assert callable(linkedin_search.posts) - - import inspect - - sig = inspect.signature(linkedin_search.posts) - assert "url" in sig.parameters - assert "start_date" in sig.parameters - assert "end_date" in sig.parameters - - def test_search_jobs_interface(self): - """Test search.linkedin.jobs interface.""" - client = BrightDataClient(token="test_token_123456789") - - # Interface: jobs(url, location, keyword, ..many filters) - linkedin_search = client.search.linkedin - - assert callable(linkedin_search.jobs) - - import inspect - - sig = inspect.signature(linkedin_search.jobs) - - # All the filters from spec - expected_params = [ - "url", - "location", - "keyword", - "country", - "timeRange", - "jobType", - "experienceLevel", - "remote", - "company", - "locationRadius", - "timeout", - ] - - for param in expected_params: - assert param in sig.parameters - - -class TestParameterArraySupport: - """Test array parameter support (str | array).""" - - def test_url_accepts_string(self): - """Test url parameter accepts single string.""" - import inspect - - scraper = LinkedInScraper(bearer_token="test_token_123456789") - sig = inspect.signature(scraper.posts) - - # Type annotation should allow str | List[str] - url_annotation = str(sig.parameters["url"].annotation) - assert "Union" in url_annotation or "|" in url_annotation - assert "str" in url_annotation - - def test_url_accepts_array_in_search_posts(self): - """Test url accepts arrays in search posts.""" - import inspect - - search = 
LinkedInSearchScraper(bearer_token="test_token_123456789") - sig = inspect.signature(search.posts) - - # url should accept str | list - annotation = str(sig.parameters["url"].annotation) - assert "Union" in annotation or "str" in annotation - - -class TestAsyncFirstAPI: - """Test all methods follow async-first pattern.""" - - def test_scraper_has_all_methods(self): - """Test scraper has all methods (async-first API, no _async suffix).""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - methods = ["posts", "jobs", "profiles", "companies"] - - for method in methods: - assert hasattr(scraper, method) - assert callable(getattr(scraper, method)) - - def test_search_has_all_methods(self): - """Test search has all methods (async-first API, no _async suffix).""" - search = LinkedInSearchScraper(bearer_token="test_token_123456789") - - methods = ["posts", "profiles", "jobs"] - - for method in methods: - assert hasattr(search, method) - assert callable(getattr(search, method)) - - -class TestPhilosophicalPrinciples: - """Test LinkedIn follows philosophical principles.""" - - def test_clear_scrape_vs_search_distinction(self): - """Test clear distinction between scrape (URL) and search (params).""" - client = BrightDataClient(token="test_token_123456789") - - scraper = client.scrape.linkedin - search = client.search.linkedin - - # Scraper is for URLs - import inspect - - scraper_posts_sig = inspect.signature(scraper.posts) - assert "url" in scraper_posts_sig.parameters - - # Search is for discovery parameters (url + date range) - search_posts_sig = inspect.signature(search.posts) - assert "url" in search_posts_sig.parameters - assert "start_date" in search_posts_sig.parameters - - def test_consistent_timeout_defaults(self): - """Test consistent timeout defaults across methods.""" - client = BrightDataClient(token="test_token_123456789") - - scraper = client.scrape.linkedin - - import inspect - - # All scrape methods should default to 65s - for method_name 
in ["posts", "jobs", "profiles", "companies"]: - sig = inspect.signature(getattr(scraper, method_name)) - assert sig.parameters["timeout"].default == 180 - - def test_uses_standard_async_workflow(self): - """Test methods use standard async workflow (no sync parameter).""" - client = BrightDataClient(token="test_token_123456789") - - scraper = client.scrape.linkedin - - import inspect - - sig = inspect.signature(scraper.posts) - - # Should not have sync parameter - assert "sync" not in sig.parameters diff --git a/tests/unit/test_models.py b/tests/unit/test_models.py index 1ac5fa7..3d7ee11 100644 --- a/tests/unit/test_models.py +++ b/tests/unit/test_models.py @@ -1,6 +1,9 @@ -"""Unit tests for result models.""" +"""Tests for result models — Creation, timing, serialization, and method tracking.""" +import json from datetime import datetime, timezone + + from brightdata.models import ( BaseResult, ScrapeResult, @@ -9,61 +12,46 @@ ) -class TestBaseResult: - """Tests for BaseResult class.""" +# --------------------------------------------------------------------------- +# BaseResult +# --------------------------------------------------------------------------- - def test_creation(self): - """Test basic creation of BaseResult.""" + +class TestBaseResult: + def test_creation_defaults(self): result = BaseResult(success=True) assert result.success is True assert result.cost is None assert result.error is None - def test_elapsed_ms(self): - """Test elapsed time calculation.""" + def test_elapsed_ms_zero_delta(self): now = datetime.now(timezone.utc) - result = BaseResult( - success=True, - trigger_sent_at=now, - data_fetched_at=now, - ) + result = BaseResult(success=True, trigger_sent_at=now, data_fetched_at=now) elapsed = result.elapsed_ms() assert elapsed is not None assert elapsed >= 0 - def test_elapsed_ms_with_delta(self): - """Test elapsed time with actual time difference.""" + def test_elapsed_ms_one_second(self): start = datetime(2025, 1, 1, 12, 0, 0) end = 
datetime(2025, 1, 1, 12, 0, 1) - result = BaseResult( - success=True, - trigger_sent_at=start, - data_fetched_at=end, - ) + result = BaseResult(success=True, trigger_sent_at=start, data_fetched_at=end) assert result.elapsed_ms() == 1000.0 - def test_get_timing_breakdown(self): - """Test timing breakdown generation.""" + def test_timing_breakdown_keys(self): now = datetime.now(timezone.utc) - result = BaseResult( - success=True, - trigger_sent_at=now, - data_fetched_at=now, - ) + result = BaseResult(success=True, trigger_sent_at=now, data_fetched_at=now) breakdown = result.get_timing_breakdown() assert "total_elapsed_ms" in breakdown assert "trigger_sent_at" in breakdown assert "data_fetched_at" in breakdown def test_to_dict(self): - """Test conversion to dictionary.""" result = BaseResult(success=True, cost=0.001) data = result.to_dict() assert data["success"] is True assert data["cost"] == 0.001 def test_to_json(self): - """Test JSON serialization.""" result = BaseResult(success=True, cost=0.001) json_str = result.to_json() assert isinstance(json_str, str) @@ -71,7 +59,6 @@ def test_to_json(self): assert "0.001" in json_str def test_save_to_file(self, tmp_path): - """Test saving to file.""" result = BaseResult(success=True, cost=0.001) filepath = tmp_path / "result.json" result.save_to_file(filepath) @@ -82,22 +69,19 @@ def test_save_to_file(self, tmp_path): assert "0.001" in content -class TestScrapeResult: - """Tests for ScrapeResult class.""" +# --------------------------------------------------------------------------- +# ScrapeResult +# --------------------------------------------------------------------------- + +class TestScrapeResult: def test_creation(self): - """Test basic creation of ScrapeResult.""" - result = ScrapeResult( - success=True, - url="https://example.com", - status="ready", - ) + result = ScrapeResult(success=True, url="https://example.com", status="ready") assert result.success is True assert result.url == "https://example.com" assert 
result.status == "ready" def test_with_platform(self): - """Test ScrapeResult with platform.""" result = ScrapeResult( success=True, url="https://www.linkedin.com/in/test", @@ -107,7 +91,6 @@ def test_with_platform(self): assert result.platform == "linkedin" def test_timing_breakdown_with_polling(self): - """Test timing breakdown includes polling information.""" start = datetime(2025, 1, 1, 12, 0, 0) snapshot_received = datetime(2025, 1, 1, 12, 0, 1) end = datetime(2025, 1, 1, 12, 0, 5) @@ -128,22 +111,19 @@ def test_timing_breakdown_with_polling(self): assert breakdown["poll_count"] == 2 -class TestSearchResult: - """Tests for SearchResult class.""" +# --------------------------------------------------------------------------- +# SearchResult +# --------------------------------------------------------------------------- + +class TestSearchResult: def test_creation(self): - """Test basic creation of SearchResult.""" - query = {"q": "python", "engine": "google"} - result = SearchResult( - success=True, - query=query, - ) + result = SearchResult(success=True, query={"q": "python", "engine": "google"}) assert result.success is True - assert result.query == query + assert result.query == {"q": "python", "engine": "google"} assert result.total_found is None def test_with_total_found(self): - """Test SearchResult with total results.""" result = SearchResult( success=True, query={"q": "python"}, @@ -154,36 +134,28 @@ def test_with_total_found(self): assert result.search_engine == "google" -class TestCrawlResult: - """Tests for CrawlResult class.""" +# --------------------------------------------------------------------------- +# CrawlResult +# --------------------------------------------------------------------------- + +class TestCrawlResult: def test_creation(self): - """Test basic creation of CrawlResult.""" - result = CrawlResult( - success=True, - domain="example.com", - ) + result = CrawlResult(success=True, domain="example.com") assert result.success is True assert 
result.domain == "example.com" assert result.pages == [] def test_with_pages(self): - """Test CrawlResult with crawled pages.""" pages = [ {"url": "https://example.com/page1", "data": {}}, {"url": "https://example.com/page2", "data": {}}, ] - result = CrawlResult( - success=True, - domain="example.com", - pages=pages, - total_pages=2, - ) + result = CrawlResult(success=True, domain="example.com", pages=pages, total_pages=2) assert len(result.pages) == 2 assert result.total_pages == 2 def test_timing_breakdown_with_crawl_duration(self): - """Test timing breakdown includes crawl duration.""" crawl_start = datetime(2025, 1, 1, 12, 0, 0) crawl_end = datetime(2025, 1, 1, 12, 5, 0) @@ -199,170 +171,77 @@ def test_timing_breakdown_with_crawl_duration(self): assert breakdown["crawl_duration_ms"] == 300000.0 -class TestInterfaceRequirements: - """Test all interface requirements are met.""" - - def test_common_fields(self): - """Test common fields across all results.""" - result = BaseResult(success=True, cost=0.001, error=None) - assert hasattr(result, "success") - assert hasattr(result, "cost") - assert hasattr(result, "error") - assert hasattr(result, "trigger_sent_at") - assert hasattr(result, "data_fetched_at") - - def test_common_methods(self): - """Test common methods across all results.""" - result = BaseResult(success=True) - assert hasattr(result, "elapsed_ms") - assert hasattr(result, "to_json") - assert hasattr(result, "save_to_file") - assert hasattr(result, "get_timing_breakdown") - - def test_scrape_specific_fields(self): - """Test ScrapeResult specific fields.""" - scrape = ScrapeResult(success=True, url="https://example.com", status="ready") - assert hasattr(scrape, "url") - assert hasattr(scrape, "platform") - assert hasattr(scrape, "method") - - def test_search_specific_fields(self): - """Test SearchResult specific fields.""" - search = SearchResult(success=True, query={"q": "test"}) - assert hasattr(search, "query") - assert hasattr(search, "total_found") 
- - def test_crawl_specific_fields(self): - """Test CrawlResult specific fields.""" - crawl = CrawlResult(success=True, domain="example.com") - assert hasattr(crawl, "domain") - assert hasattr(crawl, "pages") +# --------------------------------------------------------------------------- +# Method field tracking +# --------------------------------------------------------------------------- class TestMethodFieldTracking: - """Tests for method field tracking in results.""" - - def test_scrape_result_accepts_method_parameter(self): - """Test ScrapeResult accepts method parameter.""" + def test_accepts_method_parameter(self): result = ScrapeResult( - success=True, - url="https://example.com", - status="ready", - method="web_scraper", + success=True, url="https://example.com", status="ready", method="web_scraper" ) assert result.method == "web_scraper" - def test_scrape_result_method_can_be_web_unlocker(self): - """Test ScrapeResult method can be 'web_unlocker'.""" + def test_method_web_unlocker(self): result = ScrapeResult( - success=True, - url="https://example.com", - status="ready", - method="web_unlocker", + success=True, url="https://example.com", status="ready", method="web_unlocker" ) assert result.method == "web_unlocker" - def test_scrape_result_method_can_be_browser_api(self): - """Test ScrapeResult method can be 'browser_api'.""" + def test_method_browser_api(self): result = ScrapeResult( - success=True, - url="https://example.com", - status="ready", - method="browser_api", + success=True, url="https://example.com", status="ready", method="browser_api" ) assert result.method == "browser_api" - def test_scrape_result_method_defaults_to_none(self): - """Test ScrapeResult method defaults to None.""" - result = ScrapeResult( - success=True, - url="https://example.com", - status="ready", - ) + def test_method_defaults_to_none(self): + result = ScrapeResult(success=True, url="https://example.com", status="ready") assert result.method is None - def 
test_method_included_in_to_dict(self): - """Test method field is included in to_dict output.""" + def test_method_in_to_dict(self): result = ScrapeResult( - success=True, - url="https://example.com", - status="ready", - method="web_scraper", + success=True, url="https://example.com", status="ready", method="web_scraper" ) data = result.to_dict() - assert "method" in data assert data["method"] == "web_scraper" - def test_method_included_in_json(self): - """Test method field is included in JSON output.""" + def test_method_in_json(self): result = ScrapeResult( - success=True, - url="https://example.com", - status="ready", - method="web_unlocker", + success=True, url="https://example.com", status="ready", method="web_unlocker" ) json_str = result.to_json() - assert "method" in json_str assert "web_unlocker" in json_str def test_method_persists_through_serialization(self): - """Test method field persists through serialization.""" - import json - result = ScrapeResult( - success=True, - url="https://example.com", - status="ready", - method="browser_api", + success=True, url="https://example.com", status="ready", method="browser_api" ) - - # Serialize to dict and back data = result.to_dict() assert data["method"] == "browser_api" - # Serialize to JSON and parse - json_str = result.to_json() - parsed = json.loads(json_str) + parsed = json.loads(result.to_json()) assert parsed["method"] == "browser_api" - -class TestMethodFieldIntegration: - """Test method field integration with scrapers.""" - - def test_method_field_tracks_scraping_approach(self): - """Test method field effectively tracks scraping approach.""" - # Test all three methods - methods = ["web_scraper", "web_unlocker", "browser_api"] - - for method in methods: + def test_all_methods_valid(self): + for method in ["web_scraper", "web_unlocker", "browser_api"]: result = ScrapeResult( - success=True, - url="https://example.com", - status="ready", - method=method, + success=True, url="https://example.com", 
status="ready", method=method ) assert result.method == method - assert result.method in ["web_scraper", "web_unlocker", "browser_api"] - def test_method_field_helps_identify_data_source(self): - """Test method field helps identify data source.""" - # Different methods might have different characteristics - web_scraper = ScrapeResult( + def test_method_distinguishes_data_source(self): + ws = ScrapeResult( success=True, url="https://example.com", status="ready", method="web_scraper", platform="linkedin", ) - - web_unlocker = ScrapeResult( + wu = ScrapeResult( success=True, url="https://example.com", status="ready", method="web_unlocker", ) - - # Both valid, but method provides context - assert web_scraper.method == "web_scraper" - assert web_unlocker.method == "web_unlocker" - assert web_scraper.method != web_unlocker.method + assert ws.method != wu.method diff --git a/tests/unit/test_payloads.py b/tests/unit/test_payloads.py index 3282657..6d764ed 100644 --- a/tests/unit/test_payloads.py +++ b/tests/unit/test_payloads.py @@ -1,13 +1,4 @@ -""" -Tests for dataclass-based payloads. 
- -Tests validate: -- Runtime validation -- Default values -- Helper methods and properties -- Error handling -- Conversion to dict -""" +"""Tests for payload dataclasses — Validation, defaults, and serialization.""" import pytest from brightdata.payloads import ( @@ -32,15 +23,16 @@ ) -class TestAmazonPayloads: - """Test Amazon payload dataclasses.""" +# --------------------------------------------------------------------------- +# Amazon +# --------------------------------------------------------------------------- + - def test_amazon_product_payload_valid(self): - """Test valid Amazon product payload.""" +class TestAmazonPayloads: + def test_product_payload_valid(self): payload = AmazonProductPayload( url="https://amazon.com/dp/B0CRMZHDG8", reviews_count=50, images_count=10 ) - assert payload.url == "https://amazon.com/dp/B0CRMZHDG8" assert payload.reviews_count == 50 assert payload.images_count == 10 @@ -49,296 +41,248 @@ def test_amazon_product_payload_valid(self): assert payload.domain == "amazon.com" assert payload.is_secure is True - def test_amazon_product_payload_defaults(self): - """Test Amazon product payload with defaults.""" + def test_product_payload_defaults(self): payload = AmazonProductPayload(url="https://amazon.com/dp/B123456789") - assert payload.reviews_count is None assert payload.images_count is None - def test_amazon_product_payload_invalid_url(self): - """Test Amazon product payload with invalid URL.""" + def test_product_payload_rejects_non_amazon_url(self): with pytest.raises(ValueError, match="url must be an Amazon URL"): AmazonProductPayload(url="https://ebay.com/item/123") - def test_amazon_product_payload_negative_count(self): - """Test Amazon product payload with negative count.""" + def test_product_payload_rejects_negative_count(self): with pytest.raises(ValueError, match="reviews_count must be non-negative"): AmazonProductPayload(url="https://amazon.com/dp/B123", reviews_count=-1) - def test_amazon_product_payload_to_dict(self): 
- """Test converting Amazon product payload to dict.""" + def test_product_payload_to_dict_excludes_none(self): payload = AmazonProductPayload(url="https://amazon.com/dp/B123", reviews_count=50) - result = payload.to_dict() assert result == {"url": "https://amazon.com/dp/B123", "reviews_count": 50} - # images_count (None) should not be in dict assert "images_count" not in result - def test_amazon_review_payload_valid(self): - """Test valid Amazon review payload.""" + def test_review_payload_valid(self): payload = AmazonReviewPayload( url="https://amazon.com/dp/B123", pastDays=30, keyWord="quality", numOfReviews=100 ) - assert payload.pastDays == 30 assert payload.keyWord == "quality" assert payload.numOfReviews == 100 -class TestLinkedInPayloads: - """Test LinkedIn payload dataclasses.""" +# --------------------------------------------------------------------------- +# LinkedIn +# --------------------------------------------------------------------------- - def test_linkedin_profile_payload_valid(self): - """Test valid LinkedIn profile payload.""" - payload = LinkedInProfilePayload(url="https://linkedin.com/in/johndoe") +class TestLinkedInPayloads: + def test_profile_payload_valid(self): + payload = LinkedInProfilePayload(url="https://linkedin.com/in/johndoe") assert payload.url == "https://linkedin.com/in/johndoe" assert "linkedin.com" in payload.domain - def test_linkedin_profile_payload_invalid_url(self): - """Test LinkedIn profile payload with invalid URL.""" + def test_profile_payload_rejects_non_linkedin_url(self): with pytest.raises(ValueError, match="url must be a LinkedIn URL"): LinkedInProfilePayload(url="https://facebook.com/johndoe") - def test_linkedin_profile_search_payload_valid(self): - """Test valid LinkedIn profile search payload.""" + def test_profile_search_payload_valid(self): payload = LinkedInProfileSearchPayload(firstName="John", lastName="Doe", company="Google") - assert payload.firstName == "John" assert payload.lastName == "Doe" assert 
payload.company == "Google" - def test_linkedin_profile_search_payload_empty_firstname(self): - """Test LinkedIn profile search with empty firstName.""" + def test_profile_search_rejects_empty_firstname(self): with pytest.raises(ValueError, match="firstName is required"): LinkedInProfileSearchPayload(firstName="") - def test_linkedin_job_search_payload_valid(self): - """Test valid LinkedIn job search payload.""" + def test_job_search_payload_valid(self): payload = LinkedInJobSearchPayload( keyword="python developer", location="New York", remote=True, experienceLevel="mid" ) - assert payload.keyword == "python developer" assert payload.location == "New York" assert payload.remote is True assert payload.is_remote_search is True - def test_linkedin_job_search_payload_no_criteria(self): - """Test LinkedIn job search with no search criteria.""" + def test_job_search_rejects_no_criteria(self): with pytest.raises(ValueError, match="At least one search parameter required"): LinkedInJobSearchPayload() - def test_linkedin_job_search_payload_invalid_country(self): - """Test LinkedIn job search with invalid country code.""" + def test_job_search_rejects_invalid_country_code(self): with pytest.raises(ValueError, match="country must be 2-letter code"): - LinkedInJobSearchPayload(keyword="python", country="USA") # Should be "US" + LinkedInJobSearchPayload(keyword="python", country="USA") - def test_linkedin_post_search_payload_valid(self): - """Test valid LinkedIn post search payload.""" + def test_post_search_payload_valid(self): payload = LinkedInPostSearchPayload( url="https://linkedin.com/in/johndoe", start_date="2025-01-01", end_date="2025-12-31" ) - assert payload.start_date == "2025-01-01" assert payload.end_date == "2025-12-31" - def test_linkedin_post_search_payload_invalid_date(self): - """Test LinkedIn post search with invalid date format.""" + def test_post_search_rejects_invalid_date_format(self): with pytest.raises(ValueError, match="start_date must be in yyyy-mm-dd 
format"): LinkedInPostSearchPayload( - url="https://linkedin.com/in/johndoe", start_date="01-01-2025" # Wrong format + url="https://linkedin.com/in/johndoe", start_date="01-01-2025" ) -class TestChatGPTPayloads: - """Test ChatGPT payload dataclasses.""" +# --------------------------------------------------------------------------- +# ChatGPT +# --------------------------------------------------------------------------- + - def test_chatgpt_prompt_payload_valid(self): - """Test valid ChatGPT prompt payload.""" +class TestChatGPTPayloads: + def test_prompt_payload_valid(self): payload = ChatGPTPromptPayload( prompt="Explain Python async programming", country="US", web_search=True ) - assert payload.prompt == "Explain Python async programming" assert payload.country == "US" assert payload.web_search is True assert payload.uses_web_search is True - def test_chatgpt_prompt_payload_defaults(self): - """Test ChatGPT prompt payload defaults.""" + def test_prompt_payload_defaults(self): payload = ChatGPTPromptPayload(prompt="Test prompt") - assert payload.country == "US" assert payload.web_search is False assert payload.additional_prompt is None - def test_chatgpt_prompt_payload_empty_prompt(self): - """Test ChatGPT payload with empty prompt.""" + def test_prompt_payload_rejects_empty_prompt(self): with pytest.raises(ValueError, match="prompt is required"): ChatGPTPromptPayload(prompt="") - def test_chatgpt_prompt_payload_invalid_country(self): - """Test ChatGPT payload with invalid country code.""" + def test_prompt_payload_rejects_invalid_country(self): with pytest.raises(ValueError, match="country must be 2-letter code"): - ChatGPTPromptPayload(prompt="Test", country="USA") # Should be "US" + ChatGPTPromptPayload(prompt="Test", country="USA") - def test_chatgpt_prompt_payload_too_long(self): - """Test ChatGPT payload with prompt too long.""" + def test_prompt_payload_rejects_too_long(self): with pytest.raises(ValueError, match="prompt too long"): 
ChatGPTPromptPayload(prompt="x" * 10001) -class TestFacebookPayloads: - """Test Facebook payload dataclasses.""" +# --------------------------------------------------------------------------- +# Facebook +# --------------------------------------------------------------------------- + - def test_facebook_posts_profile_payload_valid(self): - """Test valid Facebook posts profile payload.""" +class TestFacebookPayloads: + def test_posts_profile_payload_valid(self): payload = FacebookPostsProfilePayload( url="https://facebook.com/profile", num_of_posts=10, start_date="01-01-2025", end_date="12-31-2025", ) - assert payload.url == "https://facebook.com/profile" assert payload.num_of_posts == 10 assert payload.start_date == "01-01-2025" - def test_facebook_posts_profile_payload_invalid_url(self): - """Test Facebook payload with invalid URL.""" + def test_posts_profile_rejects_non_facebook_url(self): with pytest.raises(ValueError, match="url must be a Facebook URL"): FacebookPostsProfilePayload(url="https://twitter.com/user") - def test_facebook_posts_group_payload_valid(self): - """Test valid Facebook posts group payload.""" + def test_posts_group_payload_valid(self): payload = FacebookPostsGroupPayload( url="https://facebook.com/groups/example", num_of_posts=20 ) - assert payload.url == "https://facebook.com/groups/example" assert payload.num_of_posts == 20 - def test_facebook_posts_group_payload_not_group(self): - """Test Facebook group payload without /groups/ in URL.""" + def test_posts_group_rejects_non_group_url(self): with pytest.raises(ValueError, match="url must be a Facebook group URL"): FacebookPostsGroupPayload(url="https://facebook.com/profile") - def test_facebook_comments_payload_valid(self): - """Test valid Facebook comments payload.""" + def test_comments_payload_valid(self): payload = FacebookCommentsPayload( url="https://facebook.com/post/123456", num_of_comments=100 ) - assert payload.num_of_comments == 100 -class TestInstagramPayloads: - """Test 
Instagram payload dataclasses.""" +# --------------------------------------------------------------------------- +# Instagram +# --------------------------------------------------------------------------- - def test_instagram_profile_payload_valid(self): - """Test valid Instagram profile payload.""" - payload = InstagramProfilePayload(url="https://instagram.com/username") +class TestInstagramPayloads: + def test_profile_payload_valid(self): + payload = InstagramProfilePayload(url="https://instagram.com/username") assert payload.url == "https://instagram.com/username" assert "instagram.com" in payload.domain - def test_instagram_post_payload_valid(self): - """Test valid Instagram post payload.""" + def test_post_payload_valid(self): payload = InstagramPostPayload(url="https://instagram.com/p/ABC123") - assert payload.url == "https://instagram.com/p/ABC123" assert payload.is_post is True - def test_instagram_reel_payload_valid(self): - """Test valid Instagram reel payload.""" + def test_reel_payload_valid(self): payload = InstagramReelPayload(url="https://instagram.com/reel/ABC123") - assert payload.url == "https://instagram.com/reel/ABC123" assert payload.is_reel is True - def test_instagram_posts_discover_payload_valid(self): - """Test valid Instagram posts discover payload.""" + def test_posts_discover_payload_valid(self): payload = InstagramPostsDiscoverPayload( url="https://instagram.com/username", num_of_posts=10, post_type="reel" ) - assert payload.num_of_posts == 10 assert payload.post_type == "reel" - def test_instagram_posts_discover_payload_invalid_count(self): - """Test Instagram discover payload with invalid count.""" + def test_posts_discover_rejects_zero_count(self): with pytest.raises(ValueError, match="num_of_posts must be positive"): InstagramPostsDiscoverPayload(url="https://instagram.com/username", num_of_posts=0) -class TestBasePayload: - """Test base payload functionality.""" +# 
--------------------------------------------------------------------------- +# Base payload behavior +# --------------------------------------------------------------------------- - def test_url_payload_invalid_type(self): - """Test URL payload with invalid type.""" + +class TestBasePayloadBehavior: + def test_rejects_non_string_url(self): with pytest.raises(TypeError, match="url must be string"): AmazonProductPayload(url=123) # type: ignore - def test_url_payload_empty(self): - """Test URL payload with empty string.""" + def test_rejects_empty_url(self): with pytest.raises(ValueError, match="url cannot be empty"): AmazonProductPayload(url="") - def test_url_payload_no_protocol(self): - """Test URL payload without protocol.""" + def test_rejects_url_without_protocol(self): with pytest.raises(ValueError, match="url must be valid HTTP/HTTPS URL"): AmazonProductPayload(url="amazon.com/dp/B123") - def test_url_payload_properties(self): - """Test URL payload helper properties.""" + def test_url_helper_properties(self): payload = AmazonProductPayload(url="https://amazon.com/dp/B123") - assert payload.domain == "amazon.com" assert payload.is_secure is True - # Test non-HTTPS payload_http = FacebookPostPayload(url="http://facebook.com/post/123") assert payload_http.is_secure is False - def test_to_dict_excludes_none(self): - """Test to_dict() excludes None values.""" - payload = AmazonProductPayload( - url="https://amazon.com/dp/B123", - reviews_count=50, - # images_count not provided (None) - ) - + def test_to_dict_excludes_none_values(self): + payload = AmazonProductPayload(url="https://amazon.com/dp/B123", reviews_count=50) result = payload.to_dict() assert "images_count" not in result assert "reviews_count" in result -class TestPayloadIntegration: - """Integration tests for payload usage.""" +# --------------------------------------------------------------------------- +# Integration +# --------------------------------------------------------------------------- + - def 
test_payload_lifecycle(self): - """Test complete payload lifecycle.""" - # Create payload with validation +class TestPayloadIntegration: + def test_full_lifecycle(self): payload = LinkedInJobSearchPayload( keyword="python developer", location="New York", remote=True ) - - # Check properties work assert payload.is_remote_search is True - # Convert to dict for API call api_dict = payload.to_dict() assert api_dict["keyword"] == "python developer" assert api_dict["remote"] is True - - # Verify None values excluded assert "url" not in api_dict assert "company" not in api_dict - def test_multiple_payloads_consistency(self): - """Test consistency across different payload types.""" + def test_consistent_interface_across_types(self): payloads = [ AmazonProductPayload(url="https://amazon.com/dp/B123"), LinkedInProfilePayload(url="https://linkedin.com/in/johndoe"), @@ -346,7 +290,6 @@ def test_multiple_payloads_consistency(self): InstagramPostPayload(url="https://instagram.com/p/ABC123"), ] - # All should have consistent interface for payload in payloads: assert hasattr(payload, "url") assert hasattr(payload, "domain") diff --git a/tests/unit/test_retry.py b/tests/unit/test_retry.py index cf6590a..2ee48ae 100644 --- a/tests/unit/test_retry.py +++ b/tests/unit/test_retry.py @@ -1 +1,180 @@ -"""Unit tests for retry logic.""" +"""Tests for utils/retry.py — Exponential backoff logic.""" + +from unittest.mock import AsyncMock, patch + +import pytest + +from brightdata.utils.retry import retry_with_backoff +from brightdata.exceptions import APIError, NetworkError, AuthenticationError, ValidationError + + +# --------------------------------------------------------------------------- +# Happy path +# --------------------------------------------------------------------------- + + +class TestRetrySuccess: + @pytest.mark.asyncio + async def test_returns_on_first_success(self): + func = AsyncMock(return_value="ok") + result = await retry_with_backoff(func, max_retries=3) + assert result 
== "ok" + assert func.call_count == 1 + + @pytest.mark.asyncio + async def test_retries_then_succeeds(self): + func = AsyncMock(side_effect=[NetworkError("fail"), NetworkError("fail"), "ok"]) + + with patch("brightdata.utils.retry.asyncio.sleep", new_callable=AsyncMock): + result = await retry_with_backoff(func, max_retries=3, initial_delay=0.01) + + assert result == "ok" + assert func.call_count == 3 + + +# --------------------------------------------------------------------------- +# Retryable vs non-retryable exceptions +# --------------------------------------------------------------------------- + + +class TestRetryableExceptions: + @pytest.mark.asyncio + async def test_retries_on_network_error(self): + func = AsyncMock(side_effect=[NetworkError("net"), "ok"]) + + with patch("brightdata.utils.retry.asyncio.sleep", new_callable=AsyncMock): + result = await retry_with_backoff(func, max_retries=3, initial_delay=0.01) + + assert result == "ok" + + @pytest.mark.asyncio + async def test_retries_on_timeout_error(self): + func = AsyncMock(side_effect=[TimeoutError("timeout"), "ok"]) + + with patch("brightdata.utils.retry.asyncio.sleep", new_callable=AsyncMock): + result = await retry_with_backoff(func, max_retries=3, initial_delay=0.01) + + assert result == "ok" + + @pytest.mark.asyncio + async def test_retries_on_api_error(self): + func = AsyncMock(side_effect=[APIError("500", status_code=500), "ok"]) + + with patch("brightdata.utils.retry.asyncio.sleep", new_callable=AsyncMock): + result = await retry_with_backoff(func, max_retries=3, initial_delay=0.01) + + assert result == "ok" + + @pytest.mark.asyncio + async def test_does_not_retry_authentication_error(self): + func = AsyncMock(side_effect=AuthenticationError("bad token")) + + with pytest.raises(AuthenticationError, match="bad token"): + await retry_with_backoff(func, max_retries=3) + + assert func.call_count == 1 + + @pytest.mark.asyncio + async def test_does_not_retry_validation_error(self): + func = 
AsyncMock(side_effect=ValidationError("bad input")) + + with pytest.raises(ValidationError, match="bad input"): + await retry_with_backoff(func, max_retries=3) + + assert func.call_count == 1 + + @pytest.mark.asyncio + async def test_does_not_retry_value_error(self): + func = AsyncMock(side_effect=ValueError("wrong")) + + with pytest.raises(ValueError): + await retry_with_backoff(func, max_retries=3) + + assert func.call_count == 1 + + @pytest.mark.asyncio + async def test_custom_retryable_exceptions(self): + func = AsyncMock(side_effect=[ValueError("retry me"), "ok"]) + + with patch("brightdata.utils.retry.asyncio.sleep", new_callable=AsyncMock): + result = await retry_with_backoff( + func, + max_retries=3, + retryable_exceptions=[ValueError], + initial_delay=0.01, + ) + + assert result == "ok" + + +# --------------------------------------------------------------------------- +# Exhausted retries +# --------------------------------------------------------------------------- + + +class TestRetryExhausted: + @pytest.mark.asyncio + async def test_raises_last_exception_after_max_retries(self): + func = AsyncMock(side_effect=NetworkError("persistent failure")) + + with patch("brightdata.utils.retry.asyncio.sleep", new_callable=AsyncMock): + with pytest.raises(NetworkError, match="persistent failure"): + await retry_with_backoff(func, max_retries=2, initial_delay=0.01) + + assert func.call_count == 3 # initial + 2 retries + + @pytest.mark.asyncio + async def test_zero_retries_calls_once(self): + func = AsyncMock(side_effect=NetworkError("fail")) + + with pytest.raises(NetworkError): + await retry_with_backoff(func, max_retries=0) + + assert func.call_count == 1 + + +# --------------------------------------------------------------------------- +# Backoff timing +# --------------------------------------------------------------------------- + + +class TestBackoffTiming: + @pytest.mark.asyncio + async def test_exponential_backoff_delays(self): + func = AsyncMock( + 
side_effect=[NetworkError("1"), NetworkError("2"), NetworkError("3"), "ok"] + ) + sleep_calls = [] + + async def mock_sleep(duration): + sleep_calls.append(duration) + + with patch("brightdata.utils.retry.asyncio.sleep", side_effect=mock_sleep): + result = await retry_with_backoff( + func, max_retries=3, initial_delay=1.0, backoff_factor=2.0 + ) + + assert result == "ok" + assert sleep_calls == [1.0, 2.0, 4.0] + + @pytest.mark.asyncio + async def test_max_delay_cap(self): + func = AsyncMock( + side_effect=[NetworkError("1"), NetworkError("2"), NetworkError("3"), "ok"] + ) + sleep_calls = [] + + async def mock_sleep(duration): + sleep_calls.append(duration) + + with patch("brightdata.utils.retry.asyncio.sleep", side_effect=mock_sleep): + result = await retry_with_backoff( + func, + max_retries=3, + initial_delay=10.0, + backoff_factor=10.0, + max_delay=50.0, + ) + + # Delays: min(10, 50)=10, min(100, 50)=50, min(1000, 50)=50 + assert sleep_calls == [10.0, 50.0, 50.0] diff --git a/tests/unit/test_scrapers.py b/tests/unit/test_scrapers.py deleted file mode 100644 index c999ff7..0000000 --- a/tests/unit/test_scrapers.py +++ /dev/null @@ -1,476 +0,0 @@ -"""Unit tests for base scraper and platform scrapers.""" - -import pytest -from unittest.mock import patch -from brightdata.scrapers import ( - BaseWebScraper, - AmazonScraper, - LinkedInScraper, - ChatGPTScraper, - register, - get_scraper_for, - get_registered_platforms, - is_platform_supported, -) -from brightdata.exceptions import ValidationError - - -class TestBaseWebScraper: - """Test BaseWebScraper abstract base class.""" - - def test_base_scraper_requires_dataset_id(self): - """Test base scraper requires DATASET_ID to be defined.""" - - class TestScraper(BaseWebScraper): - # Missing DATASET_ID - pass - - with pytest.raises(NotImplementedError) as exc_info: - TestScraper(bearer_token="test_token_123456789") - - assert "DATASET_ID" in str(exc_info.value) - - def test_base_scraper_requires_token(self): - """Test base 
scraper requires bearer token.""" - - class TestScraper(BaseWebScraper): - DATASET_ID = "test_dataset_123" - - with patch.dict("os.environ", {}, clear=True): - with pytest.raises(ValidationError) as exc_info: - TestScraper() - - assert "token" in str(exc_info.value).lower() - - def test_base_scraper_accepts_token_from_env(self): - """Test base scraper loads token from environment.""" - - class TestScraper(BaseWebScraper): - DATASET_ID = "test_dataset_123" - PLATFORM_NAME = "test" - - with patch.dict("os.environ", {"BRIGHTDATA_API_TOKEN": "env_token_123456789"}): - scraper = TestScraper() - assert scraper.bearer_token == "env_token_123456789" - - def test_base_scraper_has_required_attributes(self): - """Test base scraper has all required class attributes.""" - - class TestScraper(BaseWebScraper): - DATASET_ID = "test_123" - PLATFORM_NAME = "test" - - scraper = TestScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "DATASET_ID") - assert hasattr(scraper, "PLATFORM_NAME") - assert hasattr(scraper, "MIN_POLL_TIMEOUT") - assert hasattr(scraper, "COST_PER_RECORD") - assert hasattr(scraper, "engine") - - def test_base_scraper_has_scrape_methods(self): - """Test base scraper has scrape methods (async-first API).""" - - class TestScraper(BaseWebScraper): - DATASET_ID = "test_123" - - scraper = TestScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "scrape") - assert callable(scraper.scrape) - - def test_base_scraper_has_normalize_result_method(self): - """Test base scraper has normalize_result method.""" - - class TestScraper(BaseWebScraper): - DATASET_ID = "test_123" - - scraper = TestScraper(bearer_token="test_token_123456789") - - # Should return data as-is by default - test_data = {"key": "value"} - normalized = scraper.normalize_result(test_data) - assert normalized == test_data - - def test_base_scraper_repr(self): - """Test base scraper string representation.""" - - class TestScraper(BaseWebScraper): - DATASET_ID = 
"test_dataset_123" - PLATFORM_NAME = "testplatform" - - scraper = TestScraper(bearer_token="test_token_123456789") - repr_str = repr(scraper) - - assert "testplatform" in repr_str.lower() - assert "test_dataset_123" in repr_str - - -class TestRegistryPattern: - """Test registry pattern and auto-discovery.""" - - def test_register_decorator_works(self): - """Test @register decorator adds scraper to registry.""" - - @register("testplatform") - class TestScraper(BaseWebScraper): - DATASET_ID = "test_123" - PLATFORM_NAME = "testplatform" - - # Should be in registry - scraper_class = get_scraper_for("https://testplatform.com/page") - assert scraper_class is TestScraper - - def test_get_scraper_for_amazon_url(self): - """Test get_scraper_for returns AmazonScraper for Amazon URLs.""" - scraper_class = get_scraper_for("https://www.amazon.com/dp/B123") - assert scraper_class is AmazonScraper - - def test_get_scraper_for_linkedin_url(self): - """Test get_scraper_for returns LinkedInScraper for LinkedIn URLs.""" - scraper_class = get_scraper_for("https://linkedin.com/in/johndoe") - assert scraper_class is LinkedInScraper - - def test_get_scraper_for_chatgpt_url(self): - """Test get_scraper_for returns ChatGPTScraper for ChatGPT URLs.""" - scraper_class = get_scraper_for("https://chatgpt.com/c/abc123") - assert scraper_class is ChatGPTScraper - - def test_get_scraper_for_unknown_domain_returns_none(self): - """Test get_scraper_for returns None for unknown domains.""" - scraper_class = get_scraper_for("https://unknown-domain-xyz.com/page") - assert scraper_class is None - - def test_get_registered_platforms(self): - """Test get_registered_platforms returns all registered platforms.""" - platforms = get_registered_platforms() - - assert isinstance(platforms, list) - assert "amazon" in platforms - assert "linkedin" in platforms - assert "chatgpt" in platforms - - def test_is_platform_supported_for_known_platform(self): - """Test is_platform_supported returns True for known 
platforms.""" - assert is_platform_supported("https://amazon.com/dp/B123") is True - assert is_platform_supported("https://linkedin.com/in/john") is True - - def test_is_platform_supported_for_unknown_platform(self): - """Test is_platform_supported returns False for unknown platforms.""" - assert is_platform_supported("https://unknown.com/page") is False - - -class TestAmazonScraper: - """Test AmazonScraper platform-specific features.""" - - def test_amazon_scraper_has_correct_attributes(self): - """Test AmazonScraper has correct dataset ID and platform name.""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - assert scraper.PLATFORM_NAME == "amazon" - assert scraper.DATASET_ID == "gd_l7q7dkf244hwjntr0" - assert scraper.MIN_POLL_TIMEOUT == 240 - assert scraper.COST_PER_RECORD == 0.001 # Uses DEFAULT_COST_PER_RECORD - - def test_amazon_scraper_has_products_method(self): - """Test AmazonScraper has products search method (async-first API).""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "products") - assert callable(scraper.products) - - def test_amazon_scraper_has_reviews_method(self): - """Test AmazonScraper has reviews method (async-first API).""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "reviews") - assert callable(scraper.reviews) - - def test_amazon_scraper_registered_in_registry(self): - """Test AmazonScraper is registered for 'amazon' domain.""" - scraper_class = get_scraper_for("https://amazon.com/dp/B123") - assert scraper_class is AmazonScraper - - -class TestLinkedInScraper: - """Test LinkedInScraper platform-specific features.""" - - def test_linkedin_scraper_has_correct_attributes(self): - """Test LinkedInScraper has correct dataset IDs.""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - assert scraper.PLATFORM_NAME == "linkedin" - assert scraper.DATASET_ID.startswith("gd_") # People profiles - assert hasattr(scraper, 
"DATASET_ID_COMPANIES") - assert hasattr(scraper, "DATASET_ID_JOBS") - - def test_linkedin_scraper_has_profiles_method(self): - """Test LinkedInScraper has profiles search method (async-first API).""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "profiles") - assert callable(scraper.profiles) - - def test_linkedin_scraper_has_companies_method(self): - """Test LinkedInScraper has companies search method (async-first API).""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "companies") - assert callable(scraper.companies) - - def test_linkedin_scraper_has_jobs_method(self): - """Test LinkedInScraper has jobs search method (async-first API).""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "jobs") - assert callable(scraper.jobs) - - def test_linkedin_scraper_registered_in_registry(self): - """Test LinkedInScraper is registered for 'linkedin' domain.""" - scraper_class = get_scraper_for("https://linkedin.com/in/john") - assert scraper_class is LinkedInScraper - - -class TestChatGPTScraper: - """Test ChatGPTScraper platform-specific features.""" - - def test_chatgpt_scraper_has_correct_attributes(self): - """Test ChatGPTScraper has correct dataset ID.""" - scraper = ChatGPTScraper(bearer_token="test_token_123456789") - - assert scraper.PLATFORM_NAME == "chatgpt" - assert scraper.DATASET_ID.startswith("gd_") - - def test_chatgpt_scraper_has_prompt_method(self): - """Test ChatGPTScraper has prompt method (async-first API).""" - scraper = ChatGPTScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "prompt") - assert callable(scraper.prompt) - - def test_chatgpt_scraper_has_prompts_method(self): - """Test ChatGPTScraper has prompts (batch) method (async-first API).""" - scraper = ChatGPTScraper(bearer_token="test_token_123456789") - - assert hasattr(scraper, "prompts") - assert callable(scraper.prompts) - - def 
test_chatgpt_scraper_scrape_raises_not_implemented(self): - """Test ChatGPTScraper raises NotImplementedError for scrape().""" - import asyncio - - scraper = ChatGPTScraper(bearer_token="test_token_123456789") - - async def test_scrape(): - with pytest.raises(NotImplementedError) as exc_info: - await scraper.scrape("https://chatgpt.com/") - assert "doesn't support URL-based scraping" in str(exc_info.value) - assert "Use prompt()" in str(exc_info.value) - - asyncio.run(test_scrape()) - - def test_chatgpt_scraper_registered_in_registry(self): - """Test ChatGPTScraper is registered for 'chatgpt' domain.""" - scraper_class = get_scraper_for("https://chatgpt.com/c/123") - assert scraper_class is ChatGPTScraper - - -class TestScrapeVsSearchDistinction: - """Test clear distinction between scrape and search methods.""" - - def test_scrape_methods_are_url_based(self): - """Test scrape() methods accept URLs.""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - # scrape() should accept URL - assert hasattr(scraper, "scrape") - # Method signature should accept urls parameter - import inspect - - sig = inspect.signature(scraper.scrape) - assert "urls" in sig.parameters - - def test_search_methods_are_parameter_based(self): - """Test search methods (discovery) accept keywords/parameters.""" - # Search methods are in search services, not scrapers - # Scrapers are now URL-based only per API spec - - from brightdata.scrapers.linkedin import LinkedInSearchScraper - - linkedin_search = LinkedInSearchScraper(bearer_token="test_token_123456789") - - import inspect - - # LinkedIn search jobs() should accept keyword (parameter-based discovery) - jobs_sig = inspect.signature(linkedin_search.jobs) - assert "keyword" in jobs_sig.parameters - - # LinkedIn search profiles() should accept first_name (parameter-based discovery) - profiles_sig = inspect.signature(linkedin_search.profiles) - assert "first_name" in profiles_sig.parameters - - # LinkedIn search posts() should 
accept url (parameter-based discovery) - posts_sig = inspect.signature(linkedin_search.posts) - assert "url" in posts_sig.parameters - - def test_all_platform_scrapers_have_scrape(self): - """Test all platform scrapers have scrape() method.""" - scrapers = [ - AmazonScraper(bearer_token="test_token_123456789"), - LinkedInScraper(bearer_token="test_token_123456789"), - # ChatGPT is exception - it overrides to raise NotImplementedError - ] - - for scraper in scrapers: - assert hasattr(scraper, "scrape") - assert callable(scraper.scrape) - - def test_platforms_have_all_methods(self): - """Test all platforms have their methods (async-first API).""" - amazon = AmazonScraper(bearer_token="test_token_123456789") - linkedin = LinkedInScraper(bearer_token="test_token_123456789") - - # Amazon - all URL-based scrape methods - assert hasattr(amazon, "products") and callable(amazon.products) - assert hasattr(amazon, "reviews") and callable(amazon.reviews) - assert hasattr(amazon, "sellers") and callable(amazon.sellers) - - # LinkedIn - URL-based scrape methods - assert hasattr(linkedin, "posts") and callable(linkedin.posts) - assert hasattr(linkedin, "jobs") and callable(linkedin.jobs) - assert hasattr(linkedin, "profiles") and callable(linkedin.profiles) - assert hasattr(linkedin, "companies") and callable(linkedin.companies) - - -class TestClientIntegration: - """Test scrapers integrate with BrightDataClient.""" - - def test_scrapers_accessible_through_client(self): - """Test scrapers are accessible through client.scrape namespace.""" - from brightdata import BrightDataClient - - client = BrightDataClient(token="test_token_123456789") - - # All scrapers should be accessible - assert hasattr(client.scrape, "amazon") - assert hasattr(client.scrape, "linkedin") - assert hasattr(client.scrape, "chatgpt") - - def test_client_scraper_access_returns_correct_instances(self): - """Test client returns correct scraper instances.""" - from brightdata import BrightDataClient - - client = 
BrightDataClient(token="test_token_123456789") - - amazon = client.scrape.amazon - assert isinstance(amazon, AmazonScraper) - assert amazon.PLATFORM_NAME == "amazon" - - linkedin = client.scrape.linkedin - assert isinstance(linkedin, LinkedInScraper) - assert linkedin.PLATFORM_NAME == "linkedin" - - chatgpt = client.scrape.chatgpt - assert isinstance(chatgpt, ChatGPTScraper) - assert chatgpt.PLATFORM_NAME == "chatgpt" - - def test_client_passes_token_to_scrapers(self): - """Test client passes its token to scraper instances.""" - from brightdata import BrightDataClient - - token = "test_token_123456789" - client = BrightDataClient(token=token) - - amazon = client.scrape.amazon - assert amazon.bearer_token == token - - -class TestInterfaceConsistency: - """Test interface consistency across platforms.""" - - def test_amazon_interface_matches_spec(self): - """Test Amazon scraper matches interface specification.""" - scraper = AmazonScraper(bearer_token="test_token_123456789") - - # URL-based scraping - assert hasattr(scraper, "scrape") - - # Parameter-based search - assert hasattr(scraper, "products") - assert hasattr(scraper, "reviews") - - def test_linkedin_interface_matches_spec(self): - """Test LinkedIn scraper matches interface specification.""" - scraper = LinkedInScraper(bearer_token="test_token_123456789") - - # URL-based scraping - assert hasattr(scraper, "scrape") - - # Parameter-based search - assert hasattr(scraper, "profiles") - assert hasattr(scraper, "companies") - assert hasattr(scraper, "jobs") - - def test_chatgpt_interface_matches_spec(self): - """Test ChatGPT scraper matches interface specification.""" - import asyncio - - scraper = ChatGPTScraper(bearer_token="test_token_123456789") - - # Prompt-based (ChatGPT specific) - assert hasattr(scraper, "prompt") - assert hasattr(scraper, "prompts") - - # scrape() should raise NotImplementedError (async method) - async def test_scrape(): - with pytest.raises(NotImplementedError): - await 
scraper.scrape("https://chatgpt.com/") - - asyncio.run(test_scrape()) - - -class TestPhilosophicalPrinciples: - """Test scrapers follow philosophical principles.""" - - def test_platforms_feel_familiar(self): - """Test platforms have similar interfaces (familiarity).""" - amazon = AmazonScraper(bearer_token="test_token_123456789") - linkedin = LinkedInScraper(bearer_token="test_token_123456789") - - # Both should have scrape() method (async-first API) - assert hasattr(amazon, "scrape") - assert hasattr(linkedin, "scrape") - assert callable(amazon.scrape) - assert callable(linkedin.scrape) - - def test_scrape_vs_search_is_clear(self): - """Test scrape vs search distinction is clear.""" - amazon = AmazonScraper(bearer_token="test_token_123456789") - - import inspect - - # Amazon products() is now URL-based scraping (not search) - products_sig = inspect.signature(amazon.products) - assert "url" in products_sig.parameters - assert "sync" not in products_sig.parameters # sync parameter was removed - - # For search methods, check LinkedInSearchScraper - from brightdata.scrapers.linkedin import LinkedInSearchScraper - - linkedin_search = LinkedInSearchScraper(bearer_token="test_token_123456789") - - # Search jobs() signature = parameter-based (has keyword, not url required) - jobs_sig = inspect.signature(linkedin_search.jobs) - assert "keyword" in jobs_sig.parameters - - def test_architecture_supports_future_auto_routing(self): - """Test architecture is ready for future auto-routing.""" - # Registry pattern enables auto-routing - amazon_url = "https://amazon.com/dp/B123" - scraper_class = get_scraper_for(amazon_url) - - assert scraper_class is not None - assert scraper_class is AmazonScraper - - # This enables future: client.scrape.auto(url) - # The infrastructure is in place! 
diff --git a/tests/unit/test_serp.py b/tests/unit/test_serp.py deleted file mode 100644 index 53f5a92..0000000 --- a/tests/unit/test_serp.py +++ /dev/null @@ -1,507 +0,0 @@ -"""Unit tests for SERP service.""" - -from brightdata.api.serp import ( - BaseSERPService, - GoogleSERPService, - BingSERPService, - YandexSERPService, -) - - -class TestBaseSERPService: - """Test base SERP service functionality.""" - - def test_base_serp_has_search_engine_attribute(self): - """Test base SERP service has SEARCH_ENGINE attribute.""" - assert hasattr(BaseSERPService, "SEARCH_ENGINE") - assert hasattr(BaseSERPService, "ENDPOINT") - - def test_base_serp_has_search_methods(self): - """Test base SERP service has search methods (async-first API).""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - assert hasattr(service, "search") - assert callable(service.search) - - def test_base_serp_has_data_normalizer(self): - """Test base SERP has data_normalizer.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - assert hasattr(service, "data_normalizer") - assert hasattr(service.data_normalizer, "normalize") - assert callable(service.data_normalizer.normalize) - - -class TestGoogleSERPService: - """Test Google SERP service.""" - - def test_google_serp_has_correct_engine_name(self): - """Test Google SERP service has correct search engine name.""" - assert GoogleSERPService.SEARCH_ENGINE == "google" - - def test_google_serp_build_search_url(self): - """Test Google search URL building.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - url = service.url_builder.build( - query="python tutorial", - location="United States", - language="en", - device="desktop", - num_results=10, - ) - - assert "google.com/search" in url - assert 
"q=python+tutorial" in url or "q=python%20tutorial" in url - assert "num=10" in url - assert "hl=en" in url - assert "gl=" in url # Location code - - def test_google_serp_url_encoding(self): - """Test Google search query encoding.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - url = service.url_builder.build( - query="python & javascript", - location=None, - language="en", - device="desktop", - num_results=10, - ) - - # Should encode special characters - assert "google.com/search" in url - assert "+" in url or "%20" in url # Space encoded - - def test_google_serp_location_parsing(self): - """Test location name to country code parsing.""" - from brightdata.utils.location import LocationService, LocationFormat - - # Test country name mappings - assert LocationService.parse_location("United States", LocationFormat.GOOGLE) == "us" - assert LocationService.parse_location("United Kingdom", LocationFormat.GOOGLE) == "gb" - assert LocationService.parse_location("Canada", LocationFormat.GOOGLE) == "ca" - - # Test direct codes - assert LocationService.parse_location("US", LocationFormat.GOOGLE) == "us" - assert LocationService.parse_location("GB", LocationFormat.GOOGLE) == "gb" - - def test_google_serp_normalize_data(self): - """Test Google SERP data normalization.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - # Test with structured data - raw_data = { - "organic": [ - { - "title": "Python Tutorial", - "url": "https://python.org/tutorial", - "description": "Learn Python", - }, - { - "title": "Advanced Python", - "url": "https://example.com/advanced", - "description": "Advanced topics", - }, - ], - "total_results": 1000000, - } - - normalized = service.data_normalizer.normalize(raw_data) - - assert "results" in normalized - assert len(normalized["results"]) == 2 - assert 
normalized["results"][0]["position"] == 1 - assert normalized["results"][0]["title"] == "Python Tutorial" - assert normalized["results"][1]["position"] == 2 - - def test_google_serp_normalize_empty_data(self): - """Test Google SERP normalization with empty data.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - # Normalization is done via data_normalizer attribute - normalized = service.data_normalizer.normalize({}) - assert "results" in normalized - assert normalized["results"] == [] - - -class TestBingSERPService: - """Test Bing SERP service.""" - - def test_bing_serp_has_correct_engine_name(self): - """Test Bing SERP service has correct search engine name.""" - assert BingSERPService.SEARCH_ENGINE == "bing" - - def test_bing_serp_build_search_url(self): - """Test Bing search URL building.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = BingSERPService(engine) - - url = service.url_builder.build( - query="python tutorial", - location="United States", - language="en", - device="desktop", - num_results=10, - ) - - assert "bing.com/search" in url - assert "q=python" in url - assert "count=10" in url - - -class TestYandexSERPService: - """Test Yandex SERP service.""" - - def test_yandex_serp_has_correct_engine_name(self): - """Test Yandex SERP service has correct search engine name.""" - assert YandexSERPService.SEARCH_ENGINE == "yandex" - - def test_yandex_serp_build_search_url(self): - """Test Yandex search URL building.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = YandexSERPService(engine) - - url = service.url_builder.build( - query="python tutorial", - location="Russia", - language="ru", - device="desktop", - num_results=10, - ) - - assert "yandex.com/search" in url - assert "text=python" in url - assert "numdoc=10" in url - - -class 
TestSERPNormalization: - """Test SERP data normalization across engines.""" - - def test_normalized_results_have_position(self): - """Test normalized results include ranking position.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - raw_data = { - "organic": [ - {"title": "Result 1", "url": "https://example1.com", "description": "Desc 1"}, - {"title": "Result 2", "url": "https://example2.com", "description": "Desc 2"}, - ] - } - - normalized = service.data_normalizer.normalize(raw_data) - - # Each result should have position starting from 1 - for i, result in enumerate(normalized["results"], 1): - assert result["position"] == i - - def test_normalized_results_have_required_fields(self): - """Test normalized results have required fields.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - raw_data = { - "organic": [ - {"title": "Test", "url": "https://test.com", "description": "Test desc"}, - ] - } - - normalized = service.data_normalizer.normalize(raw_data) - result = normalized["results"][0] - - # Required fields - assert "position" in result - assert "title" in result - assert "url" in result - assert "description" in result - - -class TestClientIntegration: - """Test SERP services integrate with BrightDataClient.""" - - def test_search_service_accessible_through_client(self): - """Test search service is accessible via client.search.""" - from brightdata import BrightDataClient - - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client, "search") - assert client.search is not None - - def test_search_service_has_google_method(self): - """Test search service has google() method (async-first API).""" - from brightdata import BrightDataClient - - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client.search, "google") - assert 
callable(client.search.google) - - def test_search_service_has_bing_method(self): - """Test search service has bing() method (async-first API).""" - from brightdata import BrightDataClient - - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client.search, "bing") - assert callable(client.search.bing) - - def test_search_service_has_yandex_method(self): - """Test search service has yandex() method (async-first API).""" - from brightdata import BrightDataClient - - client = BrightDataClient(token="test_token_123456789") - - assert hasattr(client.search, "yandex") - assert callable(client.search.yandex) - - -class TestSERPInterfaceConsistency: - """Test interface consistency across search engines.""" - - def test_all_engines_have_same_signature(self): - """Test all search engines have consistent method signatures.""" - from brightdata import BrightDataClient - import inspect - - client = BrightDataClient(token="test_token_123456789") - - # Get signatures - google_sig = inspect.signature(client.search.google) - bing_sig = inspect.signature(client.search.bing) - yandex_sig = inspect.signature(client.search.yandex) - - # All should have 'query' parameter - assert "query" in google_sig.parameters - assert "query" in bing_sig.parameters - assert "query" in yandex_sig.parameters - - def test_all_engines_return_search_result(self): - """Test all engines return SearchResult type.""" - from brightdata import BrightDataClient - import inspect - - client = BrightDataClient(token="test_token_123456789") - - # Check return type hints if available (async-first API) - google_sig = inspect.signature(client.search.google) - # Return annotation should mention SearchResult or List[SearchResult] - if google_sig.return_annotation != inspect.Signature.empty: - assert "SearchResult" in str(google_sig.return_annotation) - - -class TestPhilosophicalPrinciples: - """Test SERP service follows philosophical principles.""" - - def 
test_serp_data_normalized_across_engines(self): - """Test SERP data is normalized for easy comparison.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - - # Same raw data structure - raw_data = { - "organic": [ - {"title": "Result", "url": "https://example.com", "description": "Desc"}, - ], - "total_results": 1000, - } - - # Both engines should normalize to same format - google_service = GoogleSERPService(engine) - google_normalized = google_service.data_normalizer.normalize(raw_data) - - # Normalized format should have: - assert "results" in google_normalized - assert "total_results" in google_normalized - assert isinstance(google_normalized["results"], list) - - def test_search_engine_quirks_handled_transparently(self): - """Test search engine specific quirks are abstracted away.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - - # Different engines have different URL patterns - google = GoogleSERPService(engine) - bing = BingSERPService(engine) - yandex = YandexSERPService(engine) - - # But all build URLs transparently - google_url = google.url_builder.build("test", None, "en", "desktop", 10) - bing_url = bing.url_builder.build("test", None, "en", "desktop", 10) - yandex_url = yandex.url_builder.build("test", None, "ru", "desktop", 10) - - # Each should have their engine's domain - assert "google.com" in google_url - assert "bing.com" in bing_url - assert "yandex.com" in yandex_url - - # But query is present in all - assert "test" in google_url - assert "test" in bing_url - assert "test" in yandex_url - - def test_results_include_ranking_position(self): - """Test results include ranking position for competitive analysis.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - raw_data = { - "organic": [ - {"title": "First", "url": "https://1.com", "description": "D1"}, - 
{"title": "Second", "url": "https://2.com", "description": "D2"}, - {"title": "Third", "url": "https://3.com", "description": "D3"}, - ] - } - - normalized = service.data_normalizer.normalize(raw_data) - - # Positions should be 1, 2, 3 - positions = [r["position"] for r in normalized["results"]] - assert positions == [1, 2, 3] - - -class TestSERPFeatureExtraction: - """Test SERP feature detection and extraction.""" - - def test_extract_featured_snippet(self): - """Test extraction of featured snippet.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - raw_data = { - "organic": [], - "featured_snippet": { - "title": "What is Python?", - "description": "Python is a programming language...", - "url": "https://python.org", - }, - } - - normalized = service.data_normalizer.normalize(raw_data) - - assert "featured_snippet" in normalized - assert normalized["featured_snippet"]["title"] == "What is Python?" 
- - def test_extract_knowledge_panel(self): - """Test extraction of knowledge panel.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - raw_data = { - "organic": [], - "knowledge_panel": { - "title": "Python", - "type": "Programming Language", - "description": "High-level programming language", - }, - } - - normalized = service.data_normalizer.normalize(raw_data) - - assert "knowledge_panel" in normalized - assert normalized["knowledge_panel"]["title"] == "Python" - - def test_extract_people_also_ask(self): - """Test extraction of People Also Ask section.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - raw_data = { - "organic": [], - "people_also_ask": [ - {"question": "What is Python used for?", "answer": "..."}, - {"question": "Is Python easy to learn?", "answer": "..."}, - ], - } - - normalized = service.data_normalizer.normalize(raw_data) - - assert "people_also_ask" in normalized - assert len(normalized["people_also_ask"]) == 2 - - -class TestLocationLanguageSupport: - """Test location and language-specific search support.""" - - def test_google_supports_location(self): - """Test Google search supports location parameter.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - url = service.url_builder.build( - query="restaurants", - location="New York", - language="en", - device="desktop", - num_results=10, - ) - - # Should have location parameter - assert "gl=" in url - - def test_google_supports_language(self): - """Test Google search supports language parameter.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - url_en = service.url_builder.build("test", None, "en", "desktop", 10) - url_es = 
service.url_builder.build("test", None, "es", "desktop", 10) - url_fr = service.url_builder.build("test", None, "fr", "desktop", 10) - - assert "hl=en" in url_en - assert "hl=es" in url_es - assert "hl=fr" in url_fr - - def test_google_supports_device_types(self): - """Test Google search supports device type parameter.""" - from brightdata.core.engine import AsyncEngine - - engine = AsyncEngine("test_token_123456789") - service = GoogleSERPService(engine) - - service.url_builder.build("test", None, "en", "desktop", 10) - url_mobile = service.url_builder.build("test", None, "en", "mobile", 10) - - # Mobile should have mobile-specific parameter - assert "mobile" in url_mobile.lower() or "mobileaction" in url_mobile diff --git a/tests/unit/test_ssl_helpers.py b/tests/unit/test_ssl_helpers.py index 224db1b..3f2fe4f 100644 --- a/tests/unit/test_ssl_helpers.py +++ b/tests/unit/test_ssl_helpers.py @@ -1,226 +1,178 @@ -"""Unit tests for SSL error handling utilities.""" +"""Tests for utils/ssl_helpers.py — SSL error detection and messages.""" import ssl from unittest.mock import Mock, patch + from brightdata.utils.ssl_helpers import is_macos, is_ssl_certificate_error, get_ssl_error_message -class TestPlatformDetection: - """Test platform detection utilities.""" +# --------------------------------------------------------------------------- +# Platform detection +# --------------------------------------------------------------------------- - def test_is_macos_returns_boolean(self): - """Test is_macos returns a boolean.""" - result = is_macos() - assert isinstance(result, bool) + +class TestPlatformDetection: + def test_returns_boolean(self): + assert isinstance(is_macos(), bool) @patch("sys.platform", "darwin") - def test_is_macos_true_on_darwin(self): - """Test is_macos returns True on darwin platform.""" - result = is_macos() - assert result is True + def test_true_on_darwin(self): + assert is_macos() is True @patch("sys.platform", "linux") - def 
test_is_macos_false_on_linux(self): - """Test is_macos returns False on linux.""" - result = is_macos() - assert result is False + def test_false_on_linux(self): + assert is_macos() is False @patch("sys.platform", "win32") - def test_is_macos_false_on_windows(self): - """Test is_macos returns False on Windows.""" - result = is_macos() - assert result is False + def test_false_on_windows(self): + assert is_macos() is False + + +# --------------------------------------------------------------------------- +# SSL certificate error detection +# --------------------------------------------------------------------------- class TestSSLCertificateErrorDetection: - """Test SSL certificate error detection.""" + def test_ssl_error_detected(self): + assert is_ssl_certificate_error(ssl.SSLError("certificate verify failed")) is True - def test_ssl_error_is_detected(self): - """Test SSL errors are detected.""" - error = ssl.SSLError("certificate verify failed") - assert is_ssl_certificate_error(error) is True + def test_oserror_with_ssl_keywords_detected(self): + assert is_ssl_certificate_error(OSError("SSL certificate verification failed")) is True - def test_oserror_with_ssl_keywords_is_detected(self): - """Test OSError with SSL keywords is detected.""" - error = OSError("SSL certificate verification failed") - assert is_ssl_certificate_error(error) is True + def test_oserror_with_certificate_keyword_detected(self): + assert is_ssl_certificate_error(OSError("unable to get local issuer certificate")) is True - def test_oserror_with_certificate_keyword_is_detected(self): - """Test OSError with 'certificate' keyword is detected.""" - error = OSError("unable to get local issuer certificate") - assert is_ssl_certificate_error(error) is True + def test_generic_exception_with_ssl_message_detected(self): + assert is_ssl_certificate_error(Exception("[SSL: CERTIFICATE_VERIFY_FAILED]")) is True - def test_generic_exception_with_ssl_message_is_detected(self): - """Test generic exception 
with SSL message is detected.""" - error = Exception("[SSL: CERTIFICATE_VERIFY_FAILED]") - assert is_ssl_certificate_error(error) is True + def test_certificate_verify_failed_detected(self): + assert is_ssl_certificate_error(Exception("certificate verify failed")) is True - def test_exception_with_certificate_verify_failed(self): - """Test exception with 'certificate verify failed' is detected.""" - error = Exception("certificate verify failed") - assert is_ssl_certificate_error(error) is True + def test_non_ssl_error_not_detected(self): + assert is_ssl_certificate_error(ValueError("Invalid value")) is False - def test_non_ssl_error_is_not_detected(self): - """Test non-SSL errors are not detected.""" - error = ValueError("Invalid value") - assert is_ssl_certificate_error(error) is False + def test_connection_error_without_ssl_not_detected(self): + assert is_ssl_certificate_error(ConnectionError("Connection refused")) is False - def test_connection_error_without_ssl_is_not_detected(self): - """Test connection errors without SSL keywords are not detected.""" - error = ConnectionError("Connection refused") - assert is_ssl_certificate_error(error) is False + def test_timeout_error_not_detected(self): + assert is_ssl_certificate_error(TimeoutError("Operation timed out")) is False - def test_timeout_error_is_not_detected(self): - """Test timeout errors are not detected as SSL errors.""" - error = TimeoutError("Operation timed out") - assert is_ssl_certificate_error(error) is False +# --------------------------------------------------------------------------- +# SSL error messages +# --------------------------------------------------------------------------- -class TestSSLErrorMessage: - """Test SSL error message generation.""" +class TestSSLErrorMessage: @patch("brightdata.utils.ssl_helpers.is_macos", return_value=True) - def test_macos_error_message_includes_platform_specific_fixes(self, mock_is_macos): - """Test macOS error message includes platform-specific fixes.""" 
- error = ssl.SSLError("certificate verify failed") - message = get_ssl_error_message(error) + def test_macos_includes_platform_specific_fixes(self, _): + message = get_ssl_error_message(ssl.SSLError("certificate verify failed")) - # Should include base message assert "SSL certificate verification failed" in message assert "macOS" in message - - # Should include macOS-specific fixes assert "Install Certificates.command" in message assert "Homebrew" in message assert "certifi" in message assert "SSL_CERT_FILE" in message @patch("brightdata.utils.ssl_helpers.is_macos", return_value=False) - def test_non_macos_error_message_excludes_macos_specific_fixes(self, mock_is_macos): - """Test non-macOS error message excludes macOS-specific fixes.""" - error = ssl.SSLError("certificate verify failed") - message = get_ssl_error_message(error) + def test_non_macos_excludes_macos_fixes(self, _): + message = get_ssl_error_message(ssl.SSLError("certificate verify failed")) - # Should include base message assert "SSL certificate verification failed" in message - - # Should NOT include macOS-specific fixes assert "Install Certificates.command" not in message assert "Homebrew" not in message - - # Should include generic fixes assert "certifi" in message assert "SSL_CERT_FILE" in message - def test_error_message_includes_original_error(self): - """Test error message includes original error.""" - error = ssl.SSLError("specific error details") - message = get_ssl_error_message(error) - + def test_includes_original_error(self): + message = get_ssl_error_message(ssl.SSLError("specific error details")) assert "Original error:" in message assert "specific error details" in message - def test_error_message_includes_fix_instructions(self): - """Test error message includes fix instructions.""" - error = ssl.SSLError("certificate verify failed") - message = get_ssl_error_message(error) - - # Should include pip install command + def test_includes_fix_instructions(self): + message = 
get_ssl_error_message(ssl.SSLError("certificate verify failed")) assert "pip install" in message assert "certifi" in message - - # Should include SSL_CERT_FILE command assert "export SSL_CERT_FILE" in message assert "python -m certifi" in message - def test_error_message_includes_documentation_link(self): - """Test error message includes documentation link.""" - error = ssl.SSLError("certificate verify failed") - message = get_ssl_error_message(error) - - # Should include link to troubleshooting docs + def test_includes_documentation_link(self): + message = get_ssl_error_message(ssl.SSLError("certificate verify failed")) assert "docs/troubleshooting" in message or "troubleshooting.md" in message -class TestSSLErrorMessageFormats: - """Test SSL error message handles different error formats.""" +# --------------------------------------------------------------------------- +# Different error formats +# --------------------------------------------------------------------------- - def test_ssl_error_with_detailed_message(self): - """Test handling of SSL error with detailed message.""" + +class TestSSLErrorFormats: + def test_detailed_ssl_error(self): error = ssl.SSLError( - "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate" + "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: " + "unable to get local issuer certificate" ) message = get_ssl_error_message(error) - assert message is not None - assert len(message) > 0 assert "SSL certificate verification failed" in message def test_oserror_with_ssl_context(self): - """Test handling of OSError with SSL context.""" - error = OSError(1, "SSL: certificate verify failed") - message = get_ssl_error_message(error) - + message = get_ssl_error_message(OSError(1, "SSL: certificate verify failed")) assert message is not None assert len(message) > 0 def test_generic_exception_with_ssl_message(self): - """Test handling of generic exception with SSL message.""" - error = 
Exception("SSL certificate problem: unable to get local issuer certificate") - message = get_ssl_error_message(error) - + message = get_ssl_error_message( + Exception("SSL certificate problem: unable to get local issuer certificate") + ) assert message is not None assert len(message) > 0 -class TestSSLErrorDetectionEdgeCases: - """Test SSL error detection edge cases.""" +# --------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- + +class TestSSLEdgeCases: def test_empty_error_message(self): - """Test handling of error with empty message.""" - error = Exception("") - assert is_ssl_certificate_error(error) is False + assert is_ssl_certificate_error(Exception("")) is False - def test_none_error_message(self): - """Test handling of error with None message.""" + def test_none_error_message_does_not_crash(self): error = Mock() error.__str__ = Mock(return_value=None) - # Should not crash - handle None return gracefully try: result = is_ssl_certificate_error(error) assert isinstance(result, bool) except (TypeError, AttributeError): - # If __str__ returns None, we should handle it gracefully - # This is acceptable behavior - function should not crash - assert True - - def test_ssl_keyword_case_insensitive(self): - """Test SSL keyword detection is case-insensitive.""" - error1 = Exception("SSL CERTIFICATE VERIFY FAILED") - error2 = Exception("ssl certificate verify failed") - error3 = Exception("Ssl Certificate Verify Failed") - - assert is_ssl_certificate_error(error1) is True - assert is_ssl_certificate_error(error2) is True - assert is_ssl_certificate_error(error3) is True - - def test_partial_ssl_keyword_match(self): - """Test partial SSL keyword matches are detected.""" - # "certificate" keyword alone should match - error = Exception("invalid certificate") - assert is_ssl_certificate_error(error) is True + pass # acceptable — function should not crash + + def 
test_case_insensitive_detection(self): + assert is_ssl_certificate_error(Exception("SSL CERTIFICATE VERIFY FAILED")) is True + assert is_ssl_certificate_error(Exception("ssl certificate verify failed")) is True + assert is_ssl_certificate_error(Exception("Ssl Certificate Verify Failed")) is True + + def test_partial_keyword_match(self): + assert is_ssl_certificate_error(Exception("invalid certificate")) is True + + def test_keyword_in_middle_of_message(self): + assert ( + is_ssl_certificate_error( + Exception("Connection failed due to SSL certificate verification error") + ) + is True + ) - def test_ssl_error_in_middle_of_message(self): - """Test SSL keywords in middle of message are detected.""" - error = Exception("Connection failed due to SSL certificate verification error") - assert is_ssl_certificate_error(error) is True +# --------------------------------------------------------------------------- +# Integration +# --------------------------------------------------------------------------- -class TestSSLHelperIntegration: - """Test SSL helper integration scenarios.""" - def test_can_identify_and_format_common_ssl_errors(self): - """Test can identify and format common SSL error scenarios.""" +class TestSSLIntegration: + def test_common_ssl_errors_identified_and_formatted(self): common_errors = [ ssl.SSLError("certificate verify failed"), Exception("[SSL: CERTIFICATE_VERIFY_FAILED]"), @@ -229,16 +181,12 @@ def test_can_identify_and_format_common_ssl_errors(self): ] for error in common_errors: - # Should be identified as SSL error assert is_ssl_certificate_error(error) is True - - # Should generate helpful message message = get_ssl_error_message(error) - assert len(message) > 100 # Should be substantial + assert len(message) > 100 assert "certifi" in message.lower() - def test_non_ssl_errors_dont_trigger_ssl_handling(self): - """Test non-SSL errors don't trigger SSL handling.""" + def test_non_ssl_errors_not_flagged(self): non_ssl_errors = [ ValueError("Invalid 
parameter"), KeyError("missing_key"), diff --git a/tests/unit/test_validation.py b/tests/unit/test_validation.py deleted file mode 100644 index 5bf955b..0000000 --- a/tests/unit/test_validation.py +++ /dev/null @@ -1 +0,0 @@ -"""Unit tests for validation.""" diff --git a/tests/unit/test_zone_manager.py b/tests/unit/test_zone_manager.py index 7185d65..0709c96 100644 --- a/tests/unit/test_zone_manager.py +++ b/tests/unit/test_zone_manager.py @@ -1,121 +1,97 @@ -"""Unit tests for ZoneManager.""" +"""Tests for core/zone_manager.py — Zone CRUD and ensure operations.""" import pytest -from unittest.mock import MagicMock + from brightdata.core.zone_manager import ZoneManager from brightdata.exceptions.errors import ZoneError, AuthenticationError +from tests.conftest import MockResponse, MockContextManager -class MockResponse: - """Mock aiohttp response for testing.""" - - def __init__(self, status: int, json_data=None, text_data=""): - self.status = status - self._json_data = json_data - self._text_data = text_data - - async def json(self): - return self._json_data - - async def text(self): - return self._text_data - - async def __aenter__(self): - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - pass - - -@pytest.fixture -def mock_engine(): - """Create a mock engine for testing.""" - engine = MagicMock() - return engine +# --------------------------------------------------------------------------- +# List Zones +# --------------------------------------------------------------------------- -class TestZoneManagerListZones: - """Tests for listing zones.""" +class TestListZones: @pytest.mark.asyncio - async def test_list_zones_success(self, mock_engine): - """Test successful zone listing.""" + async def test_returns_zones_list(self, mock_engine): zones_data = [{"name": "zone1", "type": "unblocker"}, {"name": "zone2", "type": "serp"}] - mock_engine.get.return_value = MockResponse(200, json_data=zones_data) + mock_engine.get.return_value = 
MockContextManager(MockResponse(200, json_data=zones_data)) - zone_manager = ZoneManager(mock_engine) - zones = await zone_manager.list_zones() + zm = ZoneManager(mock_engine) + zones = await zm.list_zones() assert zones == zones_data mock_engine.get.assert_called_once_with("/zone/get_active_zones") @pytest.mark.asyncio - async def test_list_zones_empty(self, mock_engine): - """Test listing zones when none exist.""" - mock_engine.get.return_value = MockResponse(200, json_data=[]) + async def test_returns_empty_list_when_none(self, mock_engine): + mock_engine.get.return_value = MockContextManager(MockResponse(200, json_data=[])) - zone_manager = ZoneManager(mock_engine) - zones = await zone_manager.list_zones() + zm = ZoneManager(mock_engine) + zones = await zm.list_zones() assert zones == [] @pytest.mark.asyncio - async def test_list_zones_null_response(self, mock_engine): - """Test listing zones when API returns null.""" - mock_engine.get.return_value = MockResponse(200, json_data=None) + async def test_returns_empty_list_on_null_response(self, mock_engine): + mock_engine.get.return_value = MockContextManager(MockResponse(200, json_data=None)) - zone_manager = ZoneManager(mock_engine) - zones = await zone_manager.list_zones() + zm = ZoneManager(mock_engine) + zones = await zm.list_zones() assert zones == [] @pytest.mark.asyncio - async def test_list_zones_auth_error_401(self, mock_engine): - """Test listing zones with 401 authentication error.""" - mock_engine.get.return_value = MockResponse(401, text_data="Invalid token") + async def test_401_raises_authentication_error(self, mock_engine): + mock_engine.get.return_value = MockContextManager( + MockResponse(401, text_data="Invalid token") + ) - zone_manager = ZoneManager(mock_engine) + zm = ZoneManager(mock_engine) with pytest.raises(AuthenticationError) as exc_info: - await zone_manager.list_zones() + await zm.list_zones() assert "401" in str(exc_info.value) assert "Invalid token" in str(exc_info.value) 
@pytest.mark.asyncio - async def test_list_zones_auth_error_403(self, mock_engine): - """Test listing zones with 403 forbidden error.""" - mock_engine.get.return_value = MockResponse(403, text_data="Forbidden") + async def test_403_raises_authentication_error(self, mock_engine): + mock_engine.get.return_value = MockContextManager(MockResponse(403, text_data="Forbidden")) - zone_manager = ZoneManager(mock_engine) + zm = ZoneManager(mock_engine) with pytest.raises(AuthenticationError) as exc_info: - await zone_manager.list_zones() + await zm.list_zones() assert "403" in str(exc_info.value) @pytest.mark.asyncio - async def test_list_zones_api_error(self, mock_engine): - """Test listing zones with general API error.""" - mock_engine.get.return_value = MockResponse(500, text_data="Internal server error") + async def test_500_raises_zone_error(self, mock_engine): + mock_engine.get.return_value = MockContextManager( + MockResponse(500, text_data="Internal server error") + ) - zone_manager = ZoneManager(mock_engine) + zm = ZoneManager(mock_engine) with pytest.raises(ZoneError) as exc_info: - await zone_manager.list_zones() + await zm.list_zones() assert "500" in str(exc_info.value) -class TestZoneManagerCreateZone: - """Tests for zone creation.""" +# --------------------------------------------------------------------------- +# Create Zone +# --------------------------------------------------------------------------- + +class TestCreateZone: @pytest.mark.asyncio - async def test_create_unblocker_zone_success(self, mock_engine): - """Test creating an unblocker zone successfully.""" - mock_engine.post.return_value = MockResponse(201) + async def test_creates_unblocker_zone(self, mock_engine): + mock_engine.post.return_value = MockContextManager(MockResponse(201)) - zone_manager = ZoneManager(mock_engine) - await zone_manager._create_zone("test_unblocker", "unblocker") + zm = ZoneManager(mock_engine) + await zm._create_zone("test_unblocker", "unblocker") - # Verify the POST 
was called with correct payload mock_engine.post.assert_called_once() call_args = mock_engine.post.call_args assert call_args[0][0] == "/zone" @@ -125,238 +101,218 @@ async def test_create_unblocker_zone_success(self, mock_engine): assert payload["plan"]["type"] == "unblocker" @pytest.mark.asyncio - async def test_create_serp_zone_success(self, mock_engine): - """Test creating a SERP zone successfully.""" - mock_engine.post.return_value = MockResponse(200) + async def test_creates_serp_zone(self, mock_engine): + mock_engine.post.return_value = MockContextManager(MockResponse(200)) - zone_manager = ZoneManager(mock_engine) - await zone_manager._create_zone("test_serp", "serp") + zm = ZoneManager(mock_engine) + await zm._create_zone("test_serp", "serp") - # Verify the POST was called with correct payload - call_args = mock_engine.post.call_args - payload = call_args[1]["json_data"] + payload = mock_engine.post.call_args[1]["json_data"] assert payload["zone"]["name"] == "test_serp" assert payload["zone"]["type"] == "serp" assert payload["plan"]["type"] == "unblocker" assert payload["plan"]["serp"] is True @pytest.mark.asyncio - async def test_create_browser_zone_success(self, mock_engine): - """Test creating a browser zone successfully.""" - mock_engine.post.return_value = MockResponse(201) + async def test_creates_browser_zone(self, mock_engine): + mock_engine.post.return_value = MockContextManager(MockResponse(201)) - zone_manager = ZoneManager(mock_engine) - await zone_manager._create_zone("test_browser", "browser") + zm = ZoneManager(mock_engine) + await zm._create_zone("test_browser", "browser") - call_args = mock_engine.post.call_args - payload = call_args[1]["json_data"] + payload = mock_engine.post.call_args[1]["json_data"] assert payload["zone"]["name"] == "test_browser" assert payload["zone"]["type"] == "browser" assert payload["plan"]["type"] == "browser" @pytest.mark.asyncio - async def test_create_zone_already_exists_409(self, mock_engine): - """Test 
creating a zone that already exists (409).""" - mock_engine.post.return_value = MockResponse(409, text_data="Conflict") + async def test_409_conflict_does_not_raise(self, mock_engine): + mock_engine.post.return_value = MockContextManager(MockResponse(409, text_data="Conflict")) - zone_manager = ZoneManager(mock_engine) - # Should not raise an exception - await zone_manager._create_zone("existing_zone", "unblocker") + zm = ZoneManager(mock_engine) + await zm._create_zone("existing_zone", "unblocker") # should not raise @pytest.mark.asyncio - async def test_create_zone_already_exists_message(self, mock_engine): - """Test creating a zone with duplicate message in response.""" - mock_engine.post.return_value = MockResponse(400, text_data="Zone already exists") + async def test_already_exists_message_does_not_raise(self, mock_engine): + mock_engine.post.return_value = MockContextManager( + MockResponse(400, text_data="Zone already exists") + ) - zone_manager = ZoneManager(mock_engine) - # Should not raise an exception - await zone_manager._create_zone("existing_zone", "unblocker") + zm = ZoneManager(mock_engine) + await zm._create_zone("existing_zone", "unblocker") # should not raise @pytest.mark.asyncio - async def test_create_zone_duplicate_message(self, mock_engine): - """Test creating a zone with duplicate name error.""" - mock_engine.post.return_value = MockResponse(400, text_data="Duplicate zone name") + async def test_duplicate_name_message_does_not_raise(self, mock_engine): + mock_engine.post.return_value = MockContextManager( + MockResponse(400, text_data="Duplicate zone name") + ) - zone_manager = ZoneManager(mock_engine) - # Should not raise an exception - await zone_manager._create_zone("duplicate_zone", "unblocker") + zm = ZoneManager(mock_engine) + await zm._create_zone("duplicate_zone", "unblocker") # should not raise @pytest.mark.asyncio - async def test_create_zone_auth_error_401(self, mock_engine): - """Test zone creation with authentication error.""" 
- mock_engine.post.return_value = MockResponse(401, text_data="Unauthorized") + async def test_401_raises_authentication_error(self, mock_engine): + mock_engine.post.return_value = MockContextManager( + MockResponse(401, text_data="Unauthorized") + ) - zone_manager = ZoneManager(mock_engine) + zm = ZoneManager(mock_engine) with pytest.raises(AuthenticationError) as exc_info: - await zone_manager._create_zone("test_zone", "unblocker") + await zm._create_zone("test_zone", "unblocker") assert "401" in str(exc_info.value) @pytest.mark.asyncio - async def test_create_zone_auth_error_403(self, mock_engine): - """Test zone creation with forbidden error.""" - mock_engine.post.return_value = MockResponse(403, text_data="Forbidden") + async def test_403_raises_authentication_error(self, mock_engine): + mock_engine.post.return_value = MockContextManager(MockResponse(403, text_data="Forbidden")) - zone_manager = ZoneManager(mock_engine) + zm = ZoneManager(mock_engine) with pytest.raises(AuthenticationError) as exc_info: - await zone_manager._create_zone("test_zone", "unblocker") + await zm._create_zone("test_zone", "unblocker") assert "403" in str(exc_info.value) @pytest.mark.asyncio - async def test_create_zone_bad_request(self, mock_engine): - """Test zone creation with bad request error.""" - mock_engine.post.return_value = MockResponse(400, text_data="Invalid zone configuration") + async def test_400_bad_request_raises_zone_error(self, mock_engine): + mock_engine.post.return_value = MockContextManager( + MockResponse(400, text_data="Invalid zone configuration") + ) - zone_manager = ZoneManager(mock_engine) + zm = ZoneManager(mock_engine) with pytest.raises(ZoneError) as exc_info: - await zone_manager._create_zone("test_zone", "unblocker") + await zm._create_zone("test_zone", "unblocker") assert "400" in str(exc_info.value) assert "Invalid zone configuration" in str(exc_info.value) -class TestZoneManagerEnsureZones: - """Tests for ensuring zones exist.""" +# 
--------------------------------------------------------------------------- +# Ensure Required Zones +# --------------------------------------------------------------------------- + +class TestEnsureRequiredZones: @pytest.mark.asyncio - async def test_ensure_zones_all_exist(self, mock_engine): - """Test ensuring zones when all already exist.""" + async def test_skips_creation_when_all_exist(self, mock_engine): zones_data = [ {"name": "sdk_unlocker", "type": "unblocker"}, {"name": "sdk_serp", "type": "serp"}, ] - mock_engine.get.return_value = MockResponse(200, json_data=zones_data) + mock_engine.get.return_value = MockContextManager(MockResponse(200, json_data=zones_data)) - zone_manager = ZoneManager(mock_engine) - await zone_manager.ensure_required_zones( - web_unlocker_zone="sdk_unlocker", serp_zone="sdk_serp" - ) + zm = ZoneManager(mock_engine) + await zm.ensure_required_zones(web_unlocker_zone="sdk_unlocker", serp_zone="sdk_serp") - # Should only call GET to list zones, not POST to create mock_engine.get.assert_called() mock_engine.post.assert_not_called() @pytest.mark.asyncio - async def test_ensure_zones_create_missing(self, mock_engine): - """Test ensuring zones when some need to be created.""" - # First call: existing zones (empty) - # After creation: zones exist + async def test_creates_missing_zones(self, mock_engine): mock_engine.get.side_effect = [ - MockResponse(200, json_data=[]), # Initial list - MockResponse( - 200, - json_data=[ # Verification list - {"name": "sdk_unlocker", "type": "unblocker"}, - {"name": "sdk_serp", "type": "serp"}, - ], + MockContextManager(MockResponse(200, json_data=[])), + MockContextManager( + MockResponse( + 200, + json_data=[ + {"name": "sdk_unlocker", "type": "unblocker"}, + {"name": "sdk_serp", "type": "serp"}, + ], + ) ), ] - mock_engine.post.return_value = MockResponse(201) + mock_engine.post.return_value = MockContextManager(MockResponse(201)) - zone_manager = ZoneManager(mock_engine) - await 
zone_manager.ensure_required_zones( - web_unlocker_zone="sdk_unlocker", serp_zone="sdk_serp" - ) + zm = ZoneManager(mock_engine) + await zm.ensure_required_zones(web_unlocker_zone="sdk_unlocker", serp_zone="sdk_serp") - # Should create both zones assert mock_engine.post.call_count == 2 @pytest.mark.asyncio - async def test_ensure_zones_only_web_unlocker(self, mock_engine): - """Test ensuring only web unlocker zone.""" + async def test_creates_only_web_unlocker(self, mock_engine): mock_engine.get.side_effect = [ - MockResponse(200, json_data=[]), - MockResponse(200, json_data=[{"name": "sdk_unlocker"}]), + MockContextManager(MockResponse(200, json_data=[])), + MockContextManager(MockResponse(200, json_data=[{"name": "sdk_unlocker"}])), ] - mock_engine.post.return_value = MockResponse(201) + mock_engine.post.return_value = MockContextManager(MockResponse(201)) - zone_manager = ZoneManager(mock_engine) - await zone_manager.ensure_required_zones(web_unlocker_zone="sdk_unlocker") + zm = ZoneManager(mock_engine) + await zm.ensure_required_zones(web_unlocker_zone="sdk_unlocker") - # Should only create web unlocker zone assert mock_engine.post.call_count == 1 @pytest.mark.asyncio - async def test_ensure_zones_with_browser(self, mock_engine): - """Test ensuring unblocker and SERP zones (browser zones NOT auto-created).""" + async def test_creates_unblocker_and_serp(self, mock_engine): mock_engine.get.side_effect = [ - MockResponse(200, json_data=[]), - MockResponse(200, json_data=[{"name": "sdk_unlocker"}, {"name": "sdk_serp"}]), + MockContextManager(MockResponse(200, json_data=[])), + MockContextManager( + MockResponse( + 200, + json_data=[ + {"name": "sdk_unlocker"}, + {"name": "sdk_serp"}, + ], + ) + ), ] - mock_engine.post.return_value = MockResponse(201) + mock_engine.post.return_value = MockContextManager(MockResponse(201)) - zone_manager = ZoneManager(mock_engine) - await zone_manager.ensure_required_zones( - web_unlocker_zone="sdk_unlocker", - serp_zone="sdk_serp", 
- ) + zm = ZoneManager(mock_engine) + await zm.ensure_required_zones(web_unlocker_zone="sdk_unlocker", serp_zone="sdk_serp") - # Should only create unblocker + SERP zones (browser zones require manual setup) assert mock_engine.post.call_count == 2 @pytest.mark.asyncio - async def test_ensure_zones_verification_fails(self, mock_engine, caplog): - """Test zone creation when verification fails (logs warning but doesn't raise).""" - # Zones never appear in verification (max_attempts = 5, so need 6 total responses) + async def test_verification_failure_logs_warning(self, mock_engine, caplog): mock_engine.get.side_effect = [ - MockResponse(200, json_data=[]), # Initial list - MockResponse(200, json_data=[]), # Verification attempt 1 - MockResponse(200, json_data=[]), # Verification attempt 2 - MockResponse(200, json_data=[]), # Verification attempt 3 - MockResponse(200, json_data=[]), # Verification attempt 4 - MockResponse(200, json_data=[]), # Verification attempt 5 (final) + MockContextManager(MockResponse(200, json_data=[])), # initial list + MockContextManager(MockResponse(200, json_data=[])), # verify 1 + MockContextManager(MockResponse(200, json_data=[])), # verify 2 + MockContextManager(MockResponse(200, json_data=[])), # verify 3 + MockContextManager(MockResponse(200, json_data=[])), # verify 4 + MockContextManager(MockResponse(200, json_data=[])), # verify 5 ] - mock_engine.post.return_value = MockResponse(201) + mock_engine.post.return_value = MockContextManager(MockResponse(201)) - zone_manager = ZoneManager(mock_engine) - # Verification failure should log warning but NOT raise exception - await zone_manager.ensure_required_zones(web_unlocker_zone="sdk_unlocker") + zm = ZoneManager(mock_engine) + await zm.ensure_required_zones(web_unlocker_zone="sdk_unlocker") - # Should have logged warning about verification failure - assert any("Zone verification failed" in record.message for record in caplog.records) + assert any("Zone verification failed" in r.message for 
r in caplog.records) -class TestZoneManagerIntegration: - """Integration-style tests for ZoneManager.""" +# --------------------------------------------------------------------------- +# Integration-style +# --------------------------------------------------------------------------- + +class TestZoneManagerIntegration: @pytest.mark.asyncio - async def test_full_workflow_no_zones_to_create(self, mock_engine): - """Test full workflow when zones already exist.""" + async def test_full_workflow_no_creation_needed(self, mock_engine): zones_data = [{"name": "my_zone", "type": "unblocker", "status": "active"}] - mock_engine.get.return_value = MockResponse(200, json_data=zones_data) + mock_engine.get.return_value = MockContextManager(MockResponse(200, json_data=zones_data)) - zone_manager = ZoneManager(mock_engine) + zm = ZoneManager(mock_engine) - # List zones - zones = await zone_manager.list_zones() + zones = await zm.list_zones() assert len(zones) == 1 assert zones[0]["name"] == "my_zone" - # Ensure zones (should not create any) - await zone_manager.ensure_required_zones(web_unlocker_zone="my_zone") + await zm.ensure_required_zones(web_unlocker_zone="my_zone") mock_engine.post.assert_not_called() @pytest.mark.asyncio - async def test_full_workflow_create_zones(self, mock_engine): - """Test full workflow creating new zones.""" + async def test_full_workflow_creates_then_lists(self, mock_engine): zones_after = [{"name": "new_zone", "type": "unblocker"}] mock_engine.get.side_effect = [ - MockResponse(200, json_data=[]), # Initial list (empty) - MockResponse(200, json_data=zones_after), # After creation (verification) - MockResponse(200, json_data=zones_after), # List zones again + MockContextManager(MockResponse(200, json_data=[])), + MockContextManager(MockResponse(200, json_data=zones_after)), + MockContextManager(MockResponse(200, json_data=zones_after)), ] - mock_engine.post.return_value = MockResponse(201) - - zone_manager = ZoneManager(mock_engine) - - # Ensure 
zones (should create) - await zone_manager.ensure_required_zones(web_unlocker_zone="new_zone") + mock_engine.post.return_value = MockContextManager(MockResponse(201)) - # Verify zone was created + zm = ZoneManager(mock_engine) + await zm.ensure_required_zones(web_unlocker_zone="new_zone") assert mock_engine.post.call_count == 1 - # List zones again - zones = await zone_manager.list_zones() + zones = await zm.list_zones() assert len(zones) == 1 assert zones[0]["name"] == "new_zone"