Data Models Demo

This is a demo of data models for database storage of the submitted data from the darc crawlers.

Assuming the database is using peewee as ORM and MySQL as backend.

Important

For more updated, battlefield-tested version of the data models, please refer to darc.model.web.

# -*- coding: utf-8 -*-
# pylint: disable=ungrouped-imports

import enum
import os
from typing import TYPE_CHECKING

import peewee
import playhouse.mysql_ext
import playhouse.shortcuts

if TYPE_CHECKING:
    from datetime import datetime
    from enum import IntEnum
    from typing import Any, Dict, List, Optional

# database client
DB = playhouse.db_url.connect(os.getenv('DB_URL', 'mysql://127.0.0.1'))


class IntEnumField(peewee.IntegerField):
    """:class:`enum.IntEnum` data field."""

    #: The original :class:`enum.IntEnum` class.
    choices: 'IntEnum'

    # def db_value(self, value: 'Optional[IntEnum]') -> 'Optional[str]':  # pylint: disable=inconsistent-return-statements
    #     """Dump the value for database storage.

    #     Args:
    #         val: Original enumeration object.

    #     Returns:
    #         Integral representation of the enumeration.

    #     """
    #     if value is not None:
    #         return value

    def python_value(self, value: 'Optional[int]') -> 'Optional[IntEnum]':  # pylint: disable=inconsistent-return-statements
        """Load the value from database storage.

        Args:
            value: Integral representation of the enumeration.

        Returns:
            Original enumeration object.

        """
        if value is not None:
            return self.choices(value)  # type: ignore
        return None


class Proxy(enum.IntEnum):
    """Proxy types supported by :mod:`darc`.

    .. _tor2web: https://onion.sh/
    """

    #: No proxy.
    NULL = enum.auto()
    #: Tor proxy.
    TOR = enum.auto()
    #: I2P proxy.
    I2P = enum.auto()
    #: ZeroNet proxy.
    ZERONET = enum.auto()
    #: Freenet proxy.
    FREENET = enum.auto()
    #: Proxied Tor (`tor2web`_, no proxy).
    TOR2WEB = enum.auto()


def table_function(model_class: peewee.Model) -> str:
    """Generate table name dynamically.

    The function strips ``Model`` from the class name and
    calls :func:`peewee.make_snake_case` to generate a
    proper table name.

    Args:
        model_class: Data model class.

    Returns:
        Generated table name.

    """
    name = model_class.__name__  # type: str
    if name.endswith('Model'):
        name = name[:-5]  # strip ``Model`` suffix
    return peewee.make_snake_case(name)


class BaseMeta:
    """Basic metadata for data models."""

    #: Reference database storage (c.f. :class:`~darc.const.DB`).
    database = DB

    #: Generate table name dynamically (c.f. :func:`~darc.model.table_function`).
    table_function = table_function


class BaseModel(peewee.Model):
    """Base model with standard patterns.

    Notes:
        The model will implicitly have a :class:`~peewee.AutoField`
        attribute named as :attr:`id`.

    """

    #: Basic metadata for data models.
    Meta = BaseMeta

    def to_dict(self, keep_id: bool = False) -> 'Dict[str, Any]':
        """Convert record to :obj:`dict`.

        Args:
            keep_id: If keep the ID auto field.

        Returns:
            The data converted through :func:`playhouse.shortcuts.model_to_dict`.

        """
        data = playhouse.shortcuts.model_to_dict(self)
        if keep_id:
            return data

        if 'id' in data:
            del data['id']
        return data


class HostnameModel(BaseModel):
    """Data model for a hostname record."""

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`).
    hostname: str = peewee.CharField(max_length=255, unique=True)  # a valid FQDN is at most 255 characters
    #: Proxy type (c.f. :attr:`link.proxy <darc.link.Link.proxy>`).
    proxy: 'Proxy' = IntEnumField(choices=Proxy)

    #: Timestamp of first ``new_host`` submission.
    discovery: 'datetime' = peewee.DateTimeField()
    #: Timestamp of last related submission.
    last_seen: 'datetime' = peewee.DateTimeField()


class RobotsModel(BaseModel):
    """Data model for ``robots.txt`` data."""

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`).
    host: 'HostnameModel' = peewee.ForeignKeyField(HostnameModel, backref='robots')
    #: Timestamp of the submission.
    timestamp: 'datetime' = peewee.DateTimeField()

    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()


class SitemapModel(BaseModel):
    """Data model for ``sitemap.xml`` data."""

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`).
    host: 'HostnameModel' = peewee.ForeignKeyField(HostnameModel, backref='sitemaps')
    #: Timestamp of the submission.
    timestamp: 'datetime' = peewee.DateTimeField()

    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()


class HostsModel(BaseModel):
    """Data model for ``hosts.txt`` data."""

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`).
    host: 'HostnameModel' = peewee.ForeignKeyField(HostnameModel, backref='hosts')
    #: Timestamp of the submission.
    timestamp: 'datetime' = peewee.DateTimeField()

    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()


class URLModel(BaseModel):
    """Data model for a requested URL."""

    #: Timestamp of last related submission.
    last_seen: 'datetime' = peewee.DateTimeField()
    #: Original URL (c.f. :attr:`link.url <darc.link.Link.url>`).
    url: str = peewee.TextField()

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`).
    host: HostnameModel = peewee.ForeignKeyField(HostnameModel, backref='urls')
    #: Proxy type (c.f. :attr:`link.proxy <darc.link.Link.proxy>`).
    proxy: str = peewee.CharField(max_length=8)

    #: Base path (c.f. :attr:`link.base <darc.link.Link.base>`).
    base: str = peewee.CharField()
    #: Link hash (c.f. :attr:`link.name <darc.link.Link.name>`).
    name: str = peewee.FixedCharField(max_length=64)

    @classmethod
    def get_by_url(cls, url: str) -> 'URLModel':
        """Select by URL.

        Args:
            url: URL to select.

        Returns:
            Selected URL model.

        """
        return cls.get(cls.url == url)

    @property
    def parents(self) -> 'List[URLModel]':
        """Back reference to where the URL was identified."""
        return (URLModel
                .select()
                .join(URLThroughModel, on=URLThroughModel.parent)
                .where(URLThroughModel.child == self)
                .order_by(URLModel.url))

    @property
    def childrent(self) -> 'List[URLModel]':
        """Back reference to which URLs were identified from the URL."""
        return (URLModel
                .select()
                .join(URLThroughModel, on=URLThroughModel.child)
                .where(URLThroughModel.parent == self)
                .order_by(URLModel.url))


class URLThroughModel(BaseModel):
    """Data model for the map of URL extration chain."""

    #: Back reference to where the URL was identified.
    parent: 'List[URLModel]' = peewee.ForeignKeyField(URLModel, backref='parents')
    #: Back reference to which URLs were identified from the URL.
    child: 'List[URLModel]' = peewee.ForeignKeyField(URLModel, backref='children')

    class Meta(BaseMeta):
        indexes = (
            # Specify a unique multi-column index on from/to-url.
            (('parent', 'child'), True),
        )


class RequestsDocumentModel(BaseModel):
    """Data model for documents from ``requests`` submission."""

    #: Original URL (c.f. :attr:`link.url <darc.link.Link.url>`).
    url: 'URLModel' = peewee.ForeignKeyField(URLModel, backref='requests')

    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()


class SeleniumDocumentModel(BaseModel):
    """Data model for documents from ``selenium`` submission."""

    #: Original URL (c.f. :attr:`link.url <darc.link.Link.url>`).
    url: 'URLModel' = peewee.ForeignKeyField(URLModel, backref='selenium')

    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()