Submission Data Schema

To better describe the submitted data, darc provides several JSON schema generated from pydantic models.

New Host Submission

The data submission from darc.submit.submit_new_host().

{
  "title": "new_host",
  "description": "Data submission from :func:`darc.submit.submit_new_host`.",
  "type": "object",
  "properties": {
    "$PARTIAL$": {
      "title": "$Partial$",
      "description": "partial flag - true / false",
      "type": "boolean"
    },
    "$RELOAD$": {
      "title": "$Reload$",
      "description": "reload flag - true / false",
      "type": "boolean"
    },
    "[metadata]": {
      "title": "[Metadata]",
      "description": "metadata of URL",
      "allOf": [
        {
          "$ref": "#/definitions/Metadata"
        }
      ]
    },
    "Timestamp": {
      "title": "Timestamp",
      "description": "requested timestamp in ISO format as in name of saved file",
      "type": "string",
      "format": "date-time"
    },
    "URL": {
      "title": "Url",
      "description": "original URL",
      "minLength": 1,
      "maxLength": 65536,
      "format": "uri",
      "type": "string"
    },
    "Robots": {
      "title": "Robots",
      "description": "robots.txt from the host (if not exists, then ``null``)",
      "allOf": [
        {
          "$ref": "#/definitions/RobotsDocument"
        }
      ]
    },
    "Sitemaps": {
      "title": "Sitemaps",
      "description": "sitemaps from the host (if none, then ``null``)",
      "type": "array",
      "items": {
        "$ref": "#/definitions/SitemapDocument"
      }
    },
    "Hosts": {
      "title": "Hosts",
      "description": "hosts.txt from the host (if proxy type is ``i2p``; if not exists, then ``null``)",
      "allOf": [
        {
          "$ref": "#/definitions/HostsDocument"
        }
      ]
    }
  },
  "required": [
    "$PARTIAL$",
    "$RELOAD$",
    "[metadata]",
    "Timestamp",
    "URL"
  ],
  "definitions": {
    "Proxy": {
      "title": "Proxy",
      "description": "Proxy type.",
      "enum": [
        "null",
        "tor",
        "i2p",
        "zeronet",
        "freenet"
      ],
      "type": "string"
    },
    "Metadata": {
      "title": "metadata",
      "description": "Metadata of URL.",
      "type": "object",
      "properties": {
        "url": {
          "title": "Url",
          "description": "original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>",
          "minLength": 1,
          "maxLength": 65536,
          "format": "uri",
          "type": "string"
        },
        "proxy": {
          "$ref": "#/definitions/Proxy"
        },
        "host": {
          "title": "Host",
          "description": "hostname / netloc, c.f. ``urllib.parse.urlparse``",
          "type": "string"
        },
        "base": {
          "title": "Base",
          "description": "base folder, relative path (to data root path ``PATH_DATA``) in containter - <proxy>/<scheme>/<host>",
          "type": "string"
        },
        "name": {
          "title": "Name",
          "description": "sha256 of URL as name for saved files (timestamp is in ISO format) - JSON log as this one: <base>/<name>_<timestamp>.json; - HTML from requests: <base>/<name>_<timestamp>_raw.html; - HTML from selenium: <base>/<name>_<timestamp>.html; - generic data files: <base>/<name>_<timestamp>.dat",
          "type": "string"
        }
      },
      "required": [
        "url",
        "proxy",
        "host",
        "base",
        "name"
      ]
    },
    "RobotsDocument": {
      "title": "RobotsDocument",
      "description": "``robots.txt`` document data.",
      "type": "object",
      "properties": {
        "path": {
          "title": "Path",
          "description": "path of the file, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>/robots.txt",
          "type": "string"
        },
        "data": {
          "title": "Data",
          "description": "content of the file (**base64** encoded)",
          "type": "string"
        }
      },
      "required": [
        "path",
        "data"
      ]
    },
    "SitemapDocument": {
      "title": "SitemapDocument",
      "description": "Sitemaps document data.",
      "type": "object",
      "properties": {
        "path": {
          "title": "Path",
          "description": "path of the file, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>/sitemap_<name>.xml",
          "type": "string"
        },
        "data": {
          "title": "Data",
          "description": "content of the file (**base64** encoded)",
          "type": "string"
        }
      },
      "required": [
        "path",
        "data"
      ]
    },
    "HostsDocument": {
      "title": "HostsDocument",
      "description": "``hosts.txt`` document data.",
      "type": "object",
      "properties": {
        "path": {
          "title": "Path",
          "description": "path of the file, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>/hosts.txt",
          "type": "string"
        },
        "data": {
          "title": "Data",
          "description": "content of the file (**base64** encoded)",
          "type": "string"
        }
      },
      "required": [
        "path",
        "data"
      ]
    }
  }
}

Requests Submission

The data submission from darc.submit.submit_requests().

{
  "title": "requests",
  "description": "Data submission from :func:`darc.submit.submit_requests`.",
  "type": "object",
  "properties": {
    "$PARTIAL$": {
      "title": "$Partial$",
      "description": "partial flag - true / false",
      "type": "boolean"
    },
    "[metadata]": {
      "title": "[Metadata]",
      "description": "metadata of URL",
      "allOf": [
        {
          "$ref": "#/definitions/Metadata"
        }
      ]
    },
    "Timestamp": {
      "title": "Timestamp",
      "description": "requested timestamp in ISO format as in name of saved file",
      "type": "string",
      "format": "date-time"
    },
    "URL": {
      "title": "Url",
      "description": "original URL",
      "minLength": 1,
      "maxLength": 65536,
      "format": "uri",
      "type": "string"
    },
    "Method": {
      "title": "Method",
      "description": "request method",
      "type": "string"
    },
    "Status-Code": {
      "title": "Status-Code",
      "description": "response status code",
      "exclusiveMinimum": 0,
      "type": "integer"
    },
    "Reason": {
      "title": "Reason",
      "description": "response reason",
      "type": "string"
    },
    "Cookies": {
      "title": "Cookies",
      "description": "response cookies (if any)",
      "type": "object",
      "additionalProperties": {
        "type": "string"
      }
    },
    "Session": {
      "title": "Session",
      "description": "session cookies (if any)",
      "type": "object",
      "additionalProperties": {
        "type": "string"
      }
    },
    "Request": {
      "title": "Request",
      "description": "request headers (if any)",
      "type": "object",
      "additionalProperties": {
        "type": "string"
      }
    },
    "Response": {
      "title": "Response",
      "description": "response headers (if any)",
      "type": "object",
      "additionalProperties": {
        "type": "string"
      }
    },
    "Content-Type": {
      "title": "Content-Type",
      "description": "content type",
      "pattern": "[a-zA-Z0-9.-]+/[a-zA-Z0-9.-]+",
      "type": "string"
    },
    "Document": {
      "title": "Document",
      "description": "requested file (if not exists, then ``null``)",
      "allOf": [
        {
          "$ref": "#/definitions/RequestsDocument"
        }
      ]
    },
    "History": {
      "title": "History",
      "description": "redirection history (if any)",
      "type": "array",
      "items": {
        "$ref": "#/definitions/HistoryModel"
      }
    }
  },
  "required": [
    "$PARTIAL$",
    "[metadata]",
    "Timestamp",
    "URL",
    "Method",
    "Status-Code",
    "Reason",
    "Cookies",
    "Session",
    "Request",
    "Response",
    "Content-Type",
    "History"
  ],
  "definitions": {
    "Proxy": {
      "title": "Proxy",
      "description": "Proxy type.",
      "enum": [
        "null",
        "tor",
        "i2p",
        "zeronet",
        "freenet"
      ],
      "type": "string"
    },
    "Metadata": {
      "title": "metadata",
      "description": "Metadata of URL.",
      "type": "object",
      "properties": {
        "url": {
          "title": "Url",
          "description": "original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>",
          "minLength": 1,
          "maxLength": 65536,
          "format": "uri",
          "type": "string"
        },
        "proxy": {
          "$ref": "#/definitions/Proxy"
        },
        "host": {
          "title": "Host",
          "description": "hostname / netloc, c.f. ``urllib.parse.urlparse``",
          "type": "string"
        },
        "base": {
          "title": "Base",
          "description": "base folder, relative path (to data root path ``PATH_DATA``) in containter - <proxy>/<scheme>/<host>",
          "type": "string"
        },
        "name": {
          "title": "Name",
          "description": "sha256 of URL as name for saved files (timestamp is in ISO format) - JSON log as this one: <base>/<name>_<timestamp>.json; - HTML from requests: <base>/<name>_<timestamp>_raw.html; - HTML from selenium: <base>/<name>_<timestamp>.html; - generic data files: <base>/<name>_<timestamp>.dat",
          "type": "string"
        }
      },
      "required": [
        "url",
        "proxy",
        "host",
        "base",
        "name"
      ]
    },
    "RequestsDocument": {
      "title": "RequestsDocument",
      "description": ":mod:`requests` document data.",
      "type": "object",
      "properties": {
        "path": {
          "title": "Path",
          "description": "path of the file, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>/<name>_<timestamp>_raw.html; or if the document is of generic content type, i.e. not HTML - <proxy>/<scheme>/<host>/<name>_<timestamp>.dat",
          "type": "string"
        },
        "data": {
          "title": "Data",
          "description": "content of the file (**base64** encoded)",
          "type": "string"
        }
      },
      "required": [
        "path",
        "data"
      ]
    },
    "HistoryModel": {
      "title": "HistoryModel",
      "description": ":mod:`requests` history data.",
      "type": "object",
      "properties": {
        "URL": {
          "title": "Url",
          "description": "original URL",
          "minLength": 1,
          "maxLength": 65536,
          "format": "uri",
          "type": "string"
        },
        "Method": {
          "title": "Method",
          "description": "request method",
          "type": "string"
        },
        "Status-Code": {
          "title": "Status-Code",
          "description": "response status code",
          "exclusiveMinimum": 0,
          "type": "integer"
        },
        "Reason": {
          "title": "Reason",
          "description": "response reason",
          "type": "string"
        },
        "Cookies": {
          "title": "Cookies",
          "description": "response cookies (if any)",
          "type": "object",
          "additionalProperties": {
            "type": "string"
          }
        },
        "Session": {
          "title": "Session",
          "description": "session cookies (if any)",
          "type": "object",
          "additionalProperties": {
            "type": "string"
          }
        },
        "Request": {
          "title": "Request",
          "description": "request headers (if any)",
          "type": "object",
          "additionalProperties": {
            "type": "string"
          }
        },
        "Response": {
          "title": "Response",
          "description": "response headers (if any)",
          "type": "object",
          "additionalProperties": {
            "type": "string"
          }
        },
        "Document": {
          "title": "Document",
          "description": "content of the file (**base64** encoded)",
          "type": "string"
        }
      },
      "required": [
        "URL",
        "Method",
        "Status-Code",
        "Reason",
        "Cookies",
        "Session",
        "Request",
        "Response",
        "Document"
      ]
    }
  }
}

Selenium Submission

The data submission from darc.submit.submit_selenium().

{
  "title": "selenium",
  "description": "Data submission from :func:`darc.submit.submit_requests`.",
  "type": "object",
  "properties": {
    "$PARTIAL$": {
      "title": "$Partial$",
      "description": "partial flag - true / false",
      "type": "boolean"
    },
    "[metadata]": {
      "title": "[Metadata]",
      "description": "metadata of URL",
      "allOf": [
        {
          "$ref": "#/definitions/Metadata"
        }
      ]
    },
    "Timestamp": {
      "title": "Timestamp",
      "description": "requested timestamp in ISO format as in name of saved file",
      "type": "string",
      "format": "date-time"
    },
    "URL": {
      "title": "Url",
      "description": "original URL",
      "minLength": 1,
      "maxLength": 65536,
      "format": "uri",
      "type": "string"
    },
    "Document": {
      "title": "Document",
      "description": "rendered HTML document (if not exists, then ``null``)",
      "allOf": [
        {
          "$ref": "#/definitions/SeleniumDocument"
        }
      ]
    },
    "Screenshot": {
      "title": "Screenshot",
      "description": "web page screenshot (if not exists, then ``null``)",
      "allOf": [
        {
          "$ref": "#/definitions/ScreenshotDocument"
        }
      ]
    }
  },
  "required": [
    "$PARTIAL$",
    "[metadata]",
    "Timestamp",
    "URL"
  ],
  "definitions": {
    "Proxy": {
      "title": "Proxy",
      "description": "Proxy type.",
      "enum": [
        "null",
        "tor",
        "i2p",
        "zeronet",
        "freenet"
      ],
      "type": "string"
    },
    "Metadata": {
      "title": "metadata",
      "description": "Metadata of URL.",
      "type": "object",
      "properties": {
        "url": {
          "title": "Url",
          "description": "original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>",
          "minLength": 1,
          "maxLength": 65536,
          "format": "uri",
          "type": "string"
        },
        "proxy": {
          "$ref": "#/definitions/Proxy"
        },
        "host": {
          "title": "Host",
          "description": "hostname / netloc, c.f. ``urllib.parse.urlparse``",
          "type": "string"
        },
        "base": {
          "title": "Base",
          "description": "base folder, relative path (to data root path ``PATH_DATA``) in containter - <proxy>/<scheme>/<host>",
          "type": "string"
        },
        "name": {
          "title": "Name",
          "description": "sha256 of URL as name for saved files (timestamp is in ISO format) - JSON log as this one: <base>/<name>_<timestamp>.json; - HTML from requests: <base>/<name>_<timestamp>_raw.html; - HTML from selenium: <base>/<name>_<timestamp>.html; - generic data files: <base>/<name>_<timestamp>.dat",
          "type": "string"
        }
      },
      "required": [
        "url",
        "proxy",
        "host",
        "base",
        "name"
      ]
    },
    "SeleniumDocument": {
      "title": "SeleniumDocument",
      "description": ":mod:`selenium` document data.",
      "type": "object",
      "properties": {
        "path": {
          "title": "Path",
          "description": "path of the file, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>/<name>_<timestamp>.html",
          "type": "string"
        },
        "data": {
          "title": "Data",
          "description": "content of the file (**base64** encoded)",
          "type": "string"
        }
      },
      "required": [
        "path",
        "data"
      ]
    },
    "ScreenshotDocument": {
      "title": "ScreenshotDocument",
      "description": "Screenshot document data.",
      "type": "object",
      "properties": {
        "path": {
          "title": "Path",
          "description": "path of the file, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>/<name>_<timestamp>.png",
          "type": "string"
        },
        "data": {
          "title": "Data",
          "description": "content of the file (**base64** encoded)",
          "type": "string"
        }
      },
      "required": [
        "path",
        "data"
      ]
    }
  }
}

Model Definitions

# -*- coding: utf-8 -*-
# pylint: disable=ungrouped-imports,no-name-in-module
"""JSON schema generator."""

from typing import TYPE_CHECKING

import pydantic.schema
from pydantic import BaseModel, Field

__all__ = ['NewHostModel', 'RequestsModel', 'SeleniumModel']

if TYPE_CHECKING:
    from datetime import datetime
    from enum import Enum
    from typing import Any, Dict, List, Optional

    from pydantic import AnyUrl, PositiveInt

    CookiesType = List[Dict[str, Any]]
    HeadersType = Dict[str, str]

    class Proxy(str, Enum):
        """Proxy type."""

        null = 'null'
        tor = 'tor'
        i2p = 'i2p'
        zeronet = 'zeronet'
        freenet = 'freenet'

###############################################################################
# Miscellaneous auxiliaries
###############################################################################


class Metadata(BaseModel):
    """Metadata of URL."""

    url: 'AnyUrl' = Field(
        description='original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>')
    proxy: 'Proxy' = Field(
        description='proxy type - null / tor / i2p / zeronet / freenet')
    host: str = Field(
        description='hostname / netloc, c.f. ``urllib.parse.urlparse``')
    base: str = Field(
        description=('base folder, relative path (to data root path ``PATH_DATA``) in containter '
                     '- <proxy>/<scheme>/<host>'))
    name: str = Field(
        description=('sha256 of URL as name for saved files (timestamp is in ISO format) '
                     '- JSON log as this one: <base>/<name>_<timestamp>.json; '
                     '- HTML from requests: <base>/<name>_<timestamp>_raw.html; '
                     '- HTML from selenium: <base>/<name>_<timestamp>.html; '
                     '- generic data files: <base>/<name>_<timestamp>.dat'))
    backref: str = Field(
        description='originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>')

    class Config:
        title = 'metadata'


class RobotsDocument(BaseModel):
    """``robots.txt`` document data."""

    path: str = Field(
        description=('path of the file, relative path (to data root path ``PATH_DATA``) in container '
                     '- <proxy>/<scheme>/<host>/robots.txt'))
    data: str = Field(
        description='content of the file (**base64** encoded)')


class SitemapDocument(BaseModel):
    """Sitemaps document data."""

    path: str = Field(
        description=('path of the file, relative path (to data root path ``PATH_DATA``) in container '
                     '- <proxy>/<scheme>/<host>/sitemap_<name>.xml'))
    data: str = Field(
        description='content of the file (**base64** encoded)')


class HostsDocument(BaseModel):
    """``hosts.txt`` document data."""

    path: str = Field(
        description=('path of the file, relative path (to data root path ``PATH_DATA``) in container '
                     '- <proxy>/<scheme>/<host>/hosts.txt'))
    data: str = Field(
        description='content of the file (**base64** encoded)')


class RequestsDocument(BaseModel):
    """:mod:`requests` document data."""

    path: str = Field(
        description=('path of the file, relative path (to data root path ``PATH_DATA``) in container '
                     '- <proxy>/<scheme>/<host>/<name>_<timestamp>_raw.html; '
                     'or if the document is of generic content type, i.e. not HTML '
                     '- <proxy>/<scheme>/<host>/<name>_<timestamp>.dat'))
    data: str = Field(
        description='content of the file (**base64** encoded)')


class HistoryModel(BaseModel):
    """:mod:`requests` history data."""

    URL: 'AnyUrl' = Field(
        description='original URL')

    Method: str = Field(
        description='request method')
    status_code: 'PositiveInt' = Field(
        alias='Status-Code',
        description='response status code')
    Reason: str = Field(
        description='response reason')

    Cookies: 'CookiesType' = Field(
        description='response cookies (if any)')
    Session: 'CookiesType' = Field(
        description='session cookies (if any)')

    Request: 'HeadersType' = Field(
        description='request headers (if any)')
    Response: 'HeadersType' = Field(
        description='response headers (if any)')

    Document: str = Field(
        description='content of the file (**base64** encoded)')


class SeleniumDocument(BaseModel):
    """:mod:`selenium` document data."""

    path: str = Field(
        description=('path of the file, relative path (to data root path ``PATH_DATA``) in container '
                     '- <proxy>/<scheme>/<host>/<name>_<timestamp>.html'))
    data: str = Field(
        description='content of the file (**base64** encoded)')


class ScreenshotDocument(BaseModel):
    """Screenshot document data."""

    path: str = Field(
        description=('path of the file, relative path (to data root path ``PATH_DATA``) in container '
                     '- <proxy>/<scheme>/<host>/<name>_<timestamp>.png'))
    data: str = Field(
        description='content of the file (**base64** encoded)')


###############################################################################
# JSON schema definitions
###############################################################################


class NewHostModel(BaseModel):
    """Data submission from :func:`darc.submit.submit_new_host`."""

    partial: bool = Field(
        alias='$PARTIAL$',
        description='partial flag - true / false')
    reload: bool = Field(
        alias='$RELOAD$',
        description='reload flag - true / false')
    metadata: 'Metadata' = Field(
        alias='[metadata]',
        description='metadata of URL')

    Timestamp: 'datetime' = Field(
        description='requested timestamp in ISO format as in name of saved file')
    URL: 'AnyUrl' = Field(
        description='original URL')

    Robots: 'Optional[RobotsDocument]' = Field(
        description='robots.txt from the host (if not exists, then ``null``)')
    Sitemaps: 'Optional[List[SitemapDocument]]' = Field(
        description='sitemaps from the host (if none, then ``null``)')
    Hosts: 'Optional[HostsDocument]' = Field(
        description='hosts.txt from the host (if proxy type is ``i2p``; if not exists, then ``null``)')


    class Config:
        title = 'new_host'


class RequestsModel(BaseModel):
    """Data submission from :func:`darc.submit.submit_requests`."""

    partial: bool = Field(
        alias='$PARTIAL$',
        description='partial flag - true / false')
    metadata: 'Metadata' = Field(
        alias='[metadata]',
        description='metadata of URL')

    Timestamp: 'datetime' = Field(
        description='requested timestamp in ISO format as in name of saved file')
    URL: 'AnyUrl' = Field(
        description='original URL')

    Method: str = Field(
        description='request method')
    status_code: 'PositiveInt' = Field(
        alias='Status-Code',
        description='response status code')
    Reason: str = Field(
        description='response reason')

    Cookies: 'CookiesType' = Field(
        description='response cookies (if any)')
    Session: 'CookiesType' = Field(
        description='session cookies (if any)')

    Request: 'HeadersType' = Field(
        description='request headers (if any)')
    Response: 'HeadersType' = Field(
        description='response headers (if any)')
    content_type: str = Field(
        alias='Content-Type',
        regex='[a-zA-Z0-9.-]+/[a-zA-Z0-9.-]+',
        description='content type')

    Document: 'Optional[RequestsDocument]' = Field(
        description='requested file (if not exists, then ``null``)')
    History: 'List[HistoryModel]' = Field(
        description='redirection history (if any)')

    class Config:
        title = 'requests'


class SeleniumModel(BaseModel):
    """Data submission from :func:`darc.submit.submit_requests`."""

    partial: bool = Field(
        alias='$PARTIAL$',
        description='partial flag - true / false')
    metadata: 'Metadata' = Field(
        alias='[metadata]',
        description='metadata of URL')

    Timestamp: 'datetime' = Field(
        description='requested timestamp in ISO format as in name of saved file')
    URL: 'AnyUrl' = Field(
        description='original URL')

    Document: 'Optional[SeleniumDocument]' = Field(
        description='rendered HTML document (if not exists, then ``null``)')
    Screenshot: 'Optional[ScreenshotDocument]' = Field(
        description='web page screenshot (if not exists, then ``null``)')

    class Config:
        title = 'selenium'


if __name__ == "__main__":
    import json
    import os

    os.makedirs('schema', exist_ok=True)

    with open('schema/new_host.schema.json', 'w') as file:
        print(NewHostModel.schema_json(indent=2), file=file)
    with open('schema/requests.schema.json', 'w') as file:
        print(RequestsModel.schema_json(indent=2), file=file)
    with open('schema/selenium.schema.json', 'w') as file:
        print(SeleniumModel.schema_json(indent=2), file=file)

    schema = pydantic.schema.schema([NewHostModel, RequestsModel, SeleniumModel],
                                    title='DARC Data Submission JSON Schema')
    with open('schema/darc.schema.json', 'w') as file:
        json.dump(schema, file, indent=2)