Turn any webpage into structured data (10 free queries remaining)
Login or register for 1000 free API credits and advanced LLM extraction
API Credits Remaining: 0
Crawl the URL below, or enter your own URL for crawling
Your API Key:
Simple scraping without extraction costs 1 API credit per API call
Basic scraping that ingests a URL and outputs markdown content and other essential information such as page metadata, multimedia content, links, etc.
# Basic scraping: POST a URL to /query and print the JSON response
# (markdown content, metadata, links, etc.). Costs 1 API credit.
import requests

response = requests.post(
    '/query',
    json={
        'url': 'https://www.kidocode.com/degrees/technology',
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
    }
)
print(response.json())
// Basic scraping: POST a URL to /query and log the JSON response
// (markdown content, metadata, links, etc.). Costs 1 API credit.
fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://www.kidocode.com/degrees/technology',
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Scraping + extraction costs 2 API credits per API call
Scrape + extract with LLM-based extraction logic. 🦾 Recommended for use cases that require reliable extraction results but the underlying webpage structure frequently and unpredictably changes
# Scrape + LLM-based structured extraction. Costs 2 API credits per call.
import requests

# Define schema for structured extraction
llm_schema = {
    'course_name': 'name of the course offered',
}

# Instruction for LLM extraction (fixed duplicated "the the" typo)
llm_instruction = '''Extract the course name of each item listed in the Explore our future-forward courses section. The extraction should look like: {'course_name':'Coding with Python'}'''

# Make request to public API
response = requests.post(
    '/query',
    json={
        'url': 'https://www.kidocode.com/degrees/technology',
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'llm_instruction': llm_instruction,
        'llm_schema': llm_schema,
        'cache_mode': 'bypass',
    }
)

# Print structured response
print(response.json()['extractions'])
// Scrape + LLM-based structured extraction. Costs 2 API credits per call.

// Define schema for structured extraction
const llmSchema = { course_name: 'name of the course offered' };

// Instruction for LLM extraction (fixed duplicated "the the" typo)
const llmInstruction = `Extract the course name of each item listed in the Explore our future-forward courses section. The extraction should look like: {'course_name':'Coding with Python'}`;

// Make request to public API
fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://www.kidocode.com/degrees/technology',
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    llm_instruction: llmInstruction,
    llm_schema: llmSchema,
    cache_mode: 'bypass',
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data.extractions));
Scraping + extraction costs 2 API credits per API call
Extract structured data from web pages using CSS selectors and optional JavaScript pre-processing, with no LLM needed. ⚡ Recommended for use cases that value fast extraction on mostly static webpage structures.
# Scrape + JSON-CSS extraction, running JavaScript first to reveal tabbed content.
import requests  # (removed unused `import json`)

url = 'https://www.kidocode.com/degrees/technology'

# JavaScript to execute before extraction: click through every tab so all
# course panels are present in the DOM when extraction runs.
js_code = """
(async () => {
    const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
    for (let tab of tabs) {
        // scroll to the tab
        tab.scrollIntoView();
        tab.click();
        // Wait for content to load and animations to complete
        await new Promise(r => setTimeout(r, 500));
    }
})();
"""

# Define extraction schema using CSS selectors
json_css_schema = {
    "name": "KidoCode Courses",
    "baseSelector": "section.charge-methodology .div-block-214.p-extraxx",
    "fields": [
        {"name": "section_title", "selector": "h3.heading-50", "type": "text"},
        {"name": "section_description", "selector": ".charge-content", "type": "text"},
        {"name": "course_name", "selector": ".text-block-93", "type": "text"},
        {"name": "course_description", "selector": ".course-content-text", "type": "text"},
        {"name": "course_icon", "selector": ".image-92", "type": "attribute", "attribute": "src"},
    ],
}

# Make API request
response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'js_code': js_code,
        'json_css_schema': json_css_schema,
        'cache_mode': 'bypass',
    }
)
print(response.json()['extractions'])
// Scrape + JSON-CSS extraction, running JavaScript first to reveal tabbed content.
const url = 'https://www.kidocode.com/degrees/technology';

// JavaScript to execute before extraction: click through every tab so all
// course panels are present in the DOM when extraction runs.
const js_code = `
(async () => {
    const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
    for (let tab of tabs) {
        // scroll to the tab
        tab.scrollIntoView();
        tab.click();
        // Wait for content to load and animations to complete
        await new Promise(r => setTimeout(r, 500));
    }
})();
`;

// Define extraction schema using CSS selectors
const json_css_schema = {
  name: 'KidoCode Courses',
  baseSelector: 'section.charge-methodology .div-block-214.p-extraxx',
  fields: [
    { name: 'section_title', selector: 'h3.heading-50', type: 'text' },
    { name: 'section_description', selector: '.charge-content', type: 'text' },
    { name: 'course_name', selector: '.text-block-93', type: 'text' },
    { name: 'course_description', selector: '.course-content-text', type: 'text' },
    { name: 'course_icon', selector: '.image-92', type: 'attribute', attribute: 'src' },
  ],
};

// Make API request
fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    js_code: js_code,
    json_css_schema: json_css_schema,
    cache_mode: 'bypass',
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data.extractions));
Control the format of the crawled content using the output_format parameter. Review detailed documentation on the 4 output options: html, cleaned_html, markdown, fit_markdown.
# Control the crawled content's format via `output_format`.
import requests

url = 'https://www.kidocode.com/degrees/technology'

# Make API request with output format
response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'output_format': 'html',   # Options: html, cleaned_html, markdown, fit_markdown
    }
)
print(response.json())
// Control the crawled content's format via `output_format`.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    output_format: 'html', // Options: html, cleaned_html, markdown, fit_markdown
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Execute custom JavaScript code on the target page before extraction
# Execute custom JavaScript on the target page before extraction
# (e.g. scroll to the bottom and click a submit button).
import requests

# JavaScript code to execute on the page
js_code = """
window.scrollTo(0, document.body.scrollHeight);
document.querySelector('#fr-submit-btn').click();
"""

response = requests.post(
    '/query',
    json={
        'url': 'https://www.kidocode.com/franchise',
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'js_code': js_code,
        'cache_mode': 'bypass',
    }
)
print(response.json())
// Execute custom JavaScript on the target page before extraction
// (e.g. scroll to the bottom and click a submit button).
const js_code = `
window.scrollTo(0, document.body.scrollHeight);
document.querySelector('#fr-submit-btn').click();
`;

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: 'https://www.kidocode.com/franchise',
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    js_code: js_code,
    cache_mode: 'bypass',
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Perfect for interactive pages that require user actions before content is available
Enable comprehensive anti-bot protection bypass with the magic parameter. Review detailed documentation.
# Enable comprehensive anti-bot protection bypass via the `magic` flag.
import requests

url = 'https://www.kidocode.com/degrees/technology'

response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'magic': True,  # Enables all anti-detection features
    }
)
print(response.json())
// Enable comprehensive anti-bot protection bypass via the `magic` flag.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    magic: true, // Enables all anti-detection features
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Enable crawling of content within iframes:
# Enable crawling of content within iframes.
import requests

url = 'https://www.kidocode.com/degrees/technology'

response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'process_iframes': True,  # Include content from iframes
    }
)
print(response.json())
// Enable crawling of content within iframes.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    process_iframes: true, // Include content from iframes
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Remove popups, ads, and other overlay elements during crawling:
# Remove popups, ads, and other overlay elements during crawling.
import requests

url = 'https://www.kidocode.com/degrees/technology'

response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'remove_overlay_elements': True,  # Remove popups and overlays
    }
)
print(response.json())
// Remove popups, ads, and other overlay elements during crawling.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    remove_overlay_elements: true, // Remove popups and overlays
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Filter out specific HTML elements during content extraction:
# Filter out specific HTML elements during content extraction.
import requests

url = 'https://www.kidocode.com/degrees/technology'

response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'excluded_tags': ['nav', 'form'],  # Ignore navigation and forms
    }
)
print(response.json())
// Filter out specific HTML elements during content extraction.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    excluded_tags: ['nav', 'form'], // Ignore navigation and forms
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Specify elements to wait for before processing the page:
# Wait for a specific element before processing the page.
# Format: "css:.selector" for CSS, "xpath://div" for XPath.
import requests

url = 'https://www.kidocode.com/degrees/technology'

response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'wait_for': 'css:.dynamic-content',  # Wait for element to appear
    }
)
print(response.json())
// Wait for a specific element before processing the page.
// Format: "css:.selector" for CSS, "xpath://div" for XPath.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    wait_for: 'css:.dynamic-content', // Wait for element to appear
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Format: Use "css:.selector" for CSS selectors or "xpath://div" for XPath expressions
Focus content extraction on specific page elements:
# Focus content extraction on specific page elements via a CSS selector.
import requests

url = 'https://www.kidocode.com/degrees/technology'

response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'css_selector': '.margin-bottom-24px',  # Only process matching elements
    }
)
print(response.json())
// Focus content extraction on specific page elements via a CSS selector.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    css_selector: '.margin-bottom-24px', // Only process matching elements
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Use standard CSS selector syntax to identify target elements
Filter content blocks based on minimum word count:
# Filter content blocks based on a minimum word count.
import requests

url = 'https://www.kidocode.com/degrees/technology'

response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'word_count_threshold': 10,  # Minimum words per content block
    }
)
print(response.json())
// Filter content blocks based on a minimum word count.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    word_count_threshold: 10, // Minimum words per content block
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Set threshold based on your content requirements
Take screenshots of web pages with optional delay:
# Take a screenshot (base64-encoded PNG in the response) and save it to disk.
import requests
import base64

url = 'https://www.kidocode.com/degrees/technology'

response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',    # value was missing in the original snippet (syntax error)
        'screenshot': True,           # Take screenshot
        'screenshot_wait_for': 2.0,   # Wait 2 seconds before capture
    }
)

# Save screenshot to file
data = response.json()
with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(data['screenshot']))
// Take a screenshot (base64-encoded PNG in the response) and trigger a download.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    screenshot: true, // Take screenshot
    screenshot_wait_for: 2.0, // Wait 2 seconds before capture
  }),
})
  .then((response) => response.json())
  .then((data) => {
    // Convert base64 to blob and create download link.
    const bytes = Uint8Array.from(atob(data.screenshot), (c) => c.charCodeAt(0));
    const blob = new Blob([bytes], { type: 'image/png' });
    // Renamed from `url` to avoid shadowing the outer request URL.
    const objectUrl = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = objectUrl;
    a.download = 'screenshot.png';
    a.click();
    URL.revokeObjectURL(objectUrl);
  });
Use screenshot_wait_for parameter to ensure dynamic content is loaded
Note: Due to payload size and latency considerations, LLM extraction and JSON-CSS extraction will be disabled when screenshot is enabled.
Control caching behavior for content retrieval:
# Control caching behavior for content retrieval.
import requests

url = 'https://www.kidocode.com/degrees/technology'

response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'cache_mode': 'bypass',    # Always fetch fresh content
    }
)
print(response.json())
// Control caching behavior for content retrieval.
const url = 'https://www.kidocode.com/degrees/technology';

fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    cache_mode: 'bypass', // Always fetch fresh content
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));
Note: Using cache bypass may increase response times
Extract structured data using LLM instructions and optional schema definitions:
# LLM-based extraction with an explicit schema and selectable input format.
import requests

url = 'https://www.kidocode.com/degrees/technology'

# Define schema for structured extraction
llm_schema = {
    'course_name': 'name of the course offering',
    'course_description': 'description of the course offering',
}

# Instruction for LLM extraction
llm_instruction = '''Extract the course_name and course_description of each course.'''

# Make API request
response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'llm_instruction': llm_instruction,
        'llm_schema': llm_schema,
        'input_format': 'markdown',  # valid values are markdown (default), fit_markdown, and html
        'cache_mode': 'bypass',
    }
)
print(response.json()['extractions'])
// LLM-based extraction with an explicit schema and selectable input format.
const url = 'https://www.kidocode.com/degrees/technology';

// Define schema for structured extraction
const llm_schema = {
  course_name: 'name of the course offering',
  course_description: 'description of the course offering',
};

// Instruction for LLM extraction
const llm_instruction = `Extract the course_name and course_description of each course.`;

// Make API request
fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    llm_instruction: llm_instruction,
    llm_schema: llm_schema,
    input_format: 'markdown', // valid values are markdown (default), fit_markdown, and html
    cache_mode: 'bypass',
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data.extractions));
Note: If no schema is provided, the LLM will infer the structure from the instruction.
The JSON-CSS-based extraction is a powerful feature of Crawl4AI that allows you to extract structured data from web pages using CSS selectors. This method is particularly useful when you need to extract specific data points from a consistent HTML structure, such as tables or repeated elements. Here's how to use it with the AsyncWebCrawler. All you need is to define a schema that specifies: 1. A base CSS selector for the repeating elements 2. Fields to extract from each element, each with its own CSS selector. This strategy is fast and efficient, as it doesn't rely on external services like LLMs for extraction.
# JSON-CSS extraction: structured data via CSS selectors, no LLM involved.
import requests  # (removed unused `import json`)

url = 'https://www.kidocode.com/degrees/technology'

# Define extraction schema using CSS selectors:
# baseSelector matches each repeating element; each field is extracted
# relative to that base element.
json_css_schema = {
    "name": "KidoCode Courses",
    "baseSelector": "section.charge-methodology .div-block-214.p-extraxx",
    "fields": [
        {"name": "section_title", "selector": "h3.heading-50", "type": "text"},
        {"name": "section_description", "selector": ".charge-content", "type": "text"},
        {"name": "course_name", "selector": ".text-block-93", "type": "text"},
        {"name": "course_description", "selector": ".course-content-text", "type": "text"},
        {"name": "course_icon", "selector": ".image-92", "type": "attribute", "attribute": "src"},
    ],
}

# Make API request
response = requests.post(
    '/query',
    json={
        'url': url,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
        'json_css_schema': json_css_schema,
        'cache_mode': 'bypass',
    }
)
print(response.json()['extractions'])
// JSON-CSS extraction: structured data via CSS selectors, no LLM involved.
const url = 'https://www.kidocode.com/degrees/technology';

// Define extraction schema using CSS selectors:
// baseSelector matches each repeating element; each field is extracted
// relative to that base element.
const json_css_schema = {
  name: 'KidoCode Courses',
  baseSelector: 'section.charge-methodology .div-block-214.p-extraxx',
  fields: [
    { name: 'section_title', selector: 'h3.heading-50', type: 'text' },
    { name: 'section_description', selector: '.charge-content', type: 'text' },
    { name: 'course_name', selector: '.text-block-93', type: 'text' },
    { name: 'course_description', selector: '.course-content-text', type: 'text' },
    { name: 'course_icon', selector: '.image-92', type: 'attribute', attribute: 'src' },
  ],
};

// Make API request
fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    url: url,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
    json_css_schema: json_css_schema,
    cache_mode: 'bypass', // was the only quoted key in the original -- normalized for consistency
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data.extractions));
Don't want to manually create JSON CSS schema? You can use this utility/helper API to turn raw HTML content into the corresponding JSON CSS schema.
# Utility mode: generate a JSON-CSS schema automatically from raw HTML.
import requests

html = """
<div class="product-card">
    <h2 class="title">Gaming Laptop</h2>
    <div class="price">$999.99</div>
    <div class="specs">
        <ul>
            <li>16GB RAM</li>
            <li>1TB SSD</li>
        </ul>
    </div>
</div>
"""

# Make API request
response = requests.post(
    '/query',
    json={
        'utility_mode': 'json_css_schema_generator',
        'html': html,
        'apikey': 'YOUR_API_KEY',  # value was missing in the original snippet (syntax error)
    }
)
print(response.json())
// Utility mode: generate a JSON-CSS schema automatically from raw HTML.
const html = `
<div class="product-card">
    <h2 class="title">Gaming Laptop</h2>
    <div class="price">$999.99</div>
    <div class="specs">
        <ul>
            <li>16GB RAM</li>
            <li>1TB SSD</li>
        </ul>
    </div>
</div>
`;

// Make API request
fetch('/query', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    utility_mode: 'json_css_schema_generator',
    html: html,
    apikey: 'YOUR_API_KEY', // value was missing in the original snippet (syntax error)
  }),
})
  .then((response) => response.json())
  .then((data) => console.log(data));