mapillary_downloader
Mapillary data downloader.
mapillary_downloader.ia_stats
Internet Archive statistics for mapillary_downloader collections.
search_ia_collections
def search_ia_collections()
Search IA for all mapillary_downloader collections.
Returns:
List of dicts with: identifier, description, item_size, uploader
parse_collection_info
def parse_collection_info(identifier)
Parse username, quality, webp from collection identifier.
Returns:
dict with username, quality, is_webp or None if invalid
extract_image_count
def extract_image_count(description)
Extract image count from IA description field.
Description format: “Contains 12,345 images in…”
load_cache
def load_cache()
Load cached collection data.
Returns:
dict of {collection_id: {size, uploader, images, quality, username}}
update_cache
def update_cache(ia_collections)
Update cache with new IA search results.
Merges new collections into existing cache.
Returns:
Updated cache dict
aggregate_stats
def aggregate_stats(cache)
Aggregate statistics from cached collection data.
Returns:
dict with total and per-quality stats
format_stats
def format_stats(stats)
Format statistics as human-readable text.
Arguments:
stats- Dict from aggregate_stats()
Returns:
Formatted string
show_stats
def show_stats(refresh=True)
Show archive.org statistics for mapillary_downloader collections.
Arguments:
refresh- If True, fetch fresh data from IA. If False, use cache only.
mapillary_downloader.webp_converter
WebP image conversion utilities.
check_cwebp_available
def check_cwebp_available()
Check if cwebp binary is available.
Returns:
bool- True if cwebp is found, False otherwise
convert_to_webp
def convert_to_webp(jpg_path, output_path=None, delete_original=True)
Convert a JPG image to WebP format, preserving EXIF metadata.
Arguments:
jpg_path- Path to the JPG file
output_path- Optional path for the WebP output. If None, uses jpg_path with .webp extension
delete_original- Whether to delete the original JPG after conversion (default: True)
Returns:
Path object to the new WebP file, or None if conversion failed
mapillary_downloader.exif_writer
EXIF metadata writer for Mapillary images.
decimal_to_dms
def decimal_to_dms(decimal)
Convert decimal degrees to degrees, minutes, seconds format for EXIF.
Arguments:
decimal- Decimal degrees (can be negative)
Returns:
Tuple of ((degrees, 1), (minutes, 1), (seconds, 100)) as rational numbers
timestamp_to_exif_datetime
def timestamp_to_exif_datetime(timestamp)
Convert Unix timestamp to EXIF datetime string.
Arguments:
timestamp- Unix timestamp in milliseconds
Returns:
String in format “YYYY:MM:DD HH:MM:SS”
write_exif_to_image
def write_exif_to_image(image_path, metadata)
Write EXIF metadata from Mapillary API to downloaded image.
Arguments:
image_path- Path to the downloaded image file
metadata- Dictionary of metadata from Mapillary API
Returns:
True if successful, False otherwise
mapillary_downloader.utils
Utility functions for formatting and display.
format_size
def format_size(bytes_count)
Format bytes as human-readable size.
Arguments:
bytes_count- Number of bytes
Returns:
Formatted string (e.g. “1.23 GB”, “456.78 MB”)
format_time
def format_time(seconds)
Format seconds as human-readable time.
Arguments:
seconds- Number of seconds
Returns:
Formatted string (e.g. “2h 15m”, “45m 30s”, “30s”)
safe_json_save
def safe_json_save(file_path, data)
Atomically save JSON data to file.
Writes to temp file, then atomic rename to prevent corruption.
Arguments:
file_path- Path to JSON file
data- Data to serialize to JSON
http_get_with_retry
def http_get_with_retry(url,
params=None,
max_retries=5,
base_delay=1.0,
timeout=60)
HTTP GET with exponential backoff retry.
Arguments:
url- URL to fetch
params- Optional query parameters
max_retries- Maximum retry attempts (default: 5)
base_delay- Initial delay in seconds (default: 1.0)
timeout- Request timeout in seconds (default: 60)
Returns:
requests.Response object
Raises:
requests.RequestException- If all retries exhausted
mapillary_downloader.tar_sequences
Tar sequence directories for efficient Internet Archive uploads.
tar_sequence_directories
def tar_sequence_directories(collection_dir)
Tar all date directories in a collection for faster IA uploads.
Organizes by capture date (YYYY-MM-DD) for incremental archive.org uploads.
Arguments:
collection_dir- Path to collection directory (e.g., mapillary-user-quality/)
Returns:
Tuple of (tarred_count, total_files_tarred)
mapillary_downloader.ia_check
Check if collections exist on Internet Archive.
check_ia_exists
def check_ia_exists(collection_name)
Check if a collection exists on Internet Archive.
Arguments:
collection_name- Name of the collection (e.g., mapillary-username-original-webp)
Returns:
Boolean indicating if the collection exists on IA
mapillary_downloader.__main__
CLI entry point.
main
def main()
Main CLI entry point.
mapillary_downloader.downloader
Main downloader logic.
get_cache_dir
def get_cache_dir()
Get XDG cache directory for staging downloads.
Returns:
Path to cache directory for mapillary_downloader
MapillaryDownloader Objects
class MapillaryDownloader()
Handles downloading Mapillary data for a user.
__init__
def __init__(client,
output_dir,
username=None,
quality=None,
max_workers=128,
tar_sequences=True,
convert_webp=False,
check_ia=True)
Initialize the downloader.
Arguments:
client- MapillaryClient instance
output_dir- Base directory to save downloads (final destination)
username- Mapillary username (for collection directory)
quality- Image quality (for collection directory)
max_workers- Maximum number of parallel workers (default: 128)
tar_sequences- Whether to tar sequence directories after download (default: True)
convert_webp- Whether to convert images to WebP (affects collection name)
check_ia- Whether to check if collection exists on Internet Archive (default: True)
download_user_data
def download_user_data(bbox=None, convert_webp=False)
Download all images for a user using streaming queue-based architecture.
Arguments:
bbox- Optional bounding box [west, south, east, north]
convert_webp- Convert images to WebP format after download
mapillary_downloader.metadata_reader
Streaming metadata reader with filtering.
MetadataReader Objects
class MetadataReader()
Streams metadata.jsonl line-by-line with filtering.
This avoids loading millions of image dicts into memory.
__init__
def __init__(metadata_file)
Initialize metadata reader.
Arguments:
metadata_file- Path to metadata.jsonl or metadata.jsonl.gz
iter_images
def iter_images(quality_field=None, downloaded_ids=None)
Stream images from metadata file with filtering.
Arguments:
quality_field- Optional field to check exists (e.g., ‘thumb_1024_url’)
downloaded_ids- Optional set of already downloaded IDs to skip
Yields:
Image metadata dicts that pass filters
get_all_ids
def get_all_ids()
Get set of all image IDs in metadata file.
Returns:
Set of image IDs (for building seen_ids)
mark_complete
@staticmethod
def mark_complete(metadata_file)
Append completion marker to metadata file.
Arguments:
metadata_file- Path to metadata.jsonl
mapillary_downloader.worker
Worker process for parallel image download and conversion.
worker_process
def worker_process(work_queue, result_queue, worker_id)
Worker process that pulls from queue and processes images.
Arguments:
work_queue- Queue to pull work items from
result_queue- Queue to push results to
worker_id- Unique worker identifier
download_and_convert_image
def download_and_convert_image(image_data, output_dir, quality, convert_webp,
session)
Download and optionally convert a single image.
This function is designed to run in a worker process.
Arguments:
image_data- Image metadata dict from API
output_dir- Base output directory path
quality- Quality level (256, 1024, 2048, original)
convert_webp- Whether to convert to WebP
session- requests.Session with auth already configured
Returns:
Tuple of (image_id, bytes_downloaded, success, error_msg)
mapillary_downloader.logging_config
Logging configuration with colored output for TTY.
ColoredFormatter Objects
class ColoredFormatter(logging.Formatter)
Formatter that adds color to log levels when output is a TTY.
__init__
def __init__(fmt=None, datefmt=None, use_color=True)
Initialize the formatter.
Arguments:
fmt- Log format string
datefmt- Date format string
use_color- Whether to use colored output
format
def format(record)
Format the log record with colors if appropriate.
Arguments:
record- LogRecord to format
Returns:
Formatted log string
setup_logging
def setup_logging(level=logging.INFO)
Set up logging with timestamps and colored output.
Arguments:
level- Logging level to use
add_file_handler
def add_file_handler(log_file, level=logging.INFO)
Add a file handler to the logger for archival.
Arguments:
log_file- Path to log file
level- Logging level for file handler
mapillary_downloader.ia_meta
Internet Archive metadata generation for Mapillary collections.
parse_collection_name
def parse_collection_name(directory)
Parse username and quality from directory name.
Arguments:
directory- Path to collection directory (e.g., mapillary-username-original or mapillary-username-original-webp)
Returns:
Tuple of (username, quality) or (None, None) if parsing fails
get_date_range
def get_date_range(metadata_file)
Get first and last captured_at dates from metadata.jsonl.gz.
Arguments:
metadata_file- Path to metadata.jsonl.gz file
Returns:
Tuple of (first_date, last_date) as ISO format strings, or (None, None)
count_images
def count_images(metadata_file)
Count number of images in metadata.jsonl.gz.
Arguments:
metadata_file- Path to metadata.jsonl.gz file
Returns:
Number of images
write_meta_tag
def write_meta_tag(meta_dir, tag, values)
Write metadata tag files in rip format.
Arguments:
meta_dir- Path to .meta directory
tag- Tag name
values- Single value or list of values
generate_ia_metadata
def generate_ia_metadata(collection_dir)
Generate Internet Archive metadata for a Mapillary collection.
Arguments:
collection_dir- Path to collection directory (e.g., ./mapillary_data/mapillary-username-original)
Returns:
True if successful, False otherwise
mapillary_downloader.worker_pool
Adaptive worker pool for parallel processing.
AdaptiveWorkerPool Objects
class AdaptiveWorkerPool()
Worker pool that scales based on throughput.
Monitors throughput every 30 seconds and adjusts worker count:
- If throughput increasing: add workers (up to max)
- If throughput plateauing/decreasing: reduce workers
__init__
def __init__(worker_func,
min_workers=4,
max_workers=16,
monitoring_interval=10)
Initialize adaptive worker pool.
Arguments:
worker_func- Function to run in each worker (must accept work_queue, result_queue)
min_workers- Minimum number of workers
max_workers- Maximum number of workers
monitoring_interval- Seconds between throughput checks
start
def start()
Start the worker pool.
submit
def submit(work_item)
Submit work to the pool (blocks if queue is full).
get_result
def get_result(timeout=None)
Get a result from the workers.
Returns:
Result from worker, or None if timeout
check_throughput
def check_throughput(total_processed)
Check throughput and adjust workers if needed.
Arguments:
total_processed- Total number of items processed so far
shutdown
def shutdown(timeout=2)
Shutdown the worker pool gracefully.
mapillary_downloader.client
Mapillary API client.
MapillaryClient Objects
class MapillaryClient()
Client for interacting with Mapillary API v4.
__init__
def __init__(access_token)
Initialize the client with an access token.
Arguments:
access_token- Mapillary API access token
get_user_images
def get_user_images(username, bbox=None, limit=2000)
Get images uploaded by a specific user.
Arguments:
username- Mapillary username
bbox- Optional bounding box [west, south, east, north]
limit- Number of results per page (max 2000)
Yields:
Image data dictionaries
download_image
def download_image(image_url, output_path)
Download an image from a URL.
Arguments:
image_url- URL of the image to download
output_path- Path to save the image
Returns:
Number of bytes downloaded if successful, 0 otherwise