Data Models Demo
This is a demo of data models for database storage of
the submitted data from the darc
crawlers.
It assumes that the database uses peewee
as the ORM and
MySQL as the backend.
Important
For a more up-to-date, battle-tested version of the
data models, please refer to darc.model.web.
# -*- coding: utf-8 -*-
# pylint: disable=ungrouped-imports

import enum
import os
from typing import TYPE_CHECKING

import peewee
# ``playhouse.db_url`` has to be imported explicitly: importing the sibling
# submodules below does not bind ``db_url`` on the ``playhouse`` package, so
# ``playhouse.db_url.connect`` would otherwise fail with ``AttributeError``.
import playhouse.db_url
import playhouse.mysql_ext
import playhouse.shortcuts

if TYPE_CHECKING:
    # Only needed for the string annotations used throughout this module.
    from datetime import datetime
    from enum import IntEnum
    from typing import Any, Dict, List, Optional

# Database client, built from the ``DB_URL`` environment variable; falls back
# to ``mysql://127.0.0.1`` when the variable is not set.
DB = playhouse.db_url.connect(os.getenv('DB_URL', 'mysql://127.0.0.1'))
class IntEnumField(peewee.IntegerField):
    """Integer field whose Python-side value is an :class:`enum.IntEnum` member."""

    #: The :class:`enum.IntEnum` class used to rebuild members from stored integers.
    choices: 'IntEnum'

    # NOTE: a pass-through ``db_value`` override is intentionally omitted;
    # values going *to* the database use the inherited conversion unchanged.

    def python_value(self, value: 'Optional[int]') -> 'Optional[IntEnum]':  # pylint: disable=inconsistent-return-statements
        """Convert a stored integer back into its enumeration member.

        Args:
            value: Integral representation of the enumeration, as read
                from the database.

        Returns:
            ``self.choices(value)``, or :data:`None` when ``value`` is
            :data:`None`.

        """
        if value is None:
            return None
        return self.choices(value)  # type: ignore
class Proxy(enum.IntEnum):
    """Proxy types supported by :mod:`darc`.

    The members are numbered consecutively starting from ``1``, in
    declaration order.

    .. _tor2web: https://onion.sh/

    """

    #: No proxy.
    NULL = 1
    #: Tor proxy.
    TOR = 2
    #: I2P proxy.
    I2P = 3
    #: ZeroNet proxy.
    ZERONET = 4
    #: Freenet proxy.
    FREENET = 5
    #: Proxied Tor (`tor2web`_, no proxy).
    TOR2WEB = 6
def table_function(model_class: peewee.Model) -> str:
    """Derive a table name from a model class name.

    A trailing ``Model`` is dropped from the class name and the rest is
    passed to :func:`peewee.make_snake_case`.

    Args:
        model_class: Data model class.

    Returns:
        Generated table name.

    """
    suffix = 'Model'
    class_name = model_class.__name__  # type: str
    # Only a trailing ``Model`` is removed; other names are used as-is.
    stem = class_name[:-len(suffix)] if class_name.endswith(suffix) else class_name
    return peewee.make_snake_case(stem)
class BaseMeta:
    """Basic metadata for data models.

    Holds the options shared by the models in this module: the database
    handle and the callable used to generate table names.

    """

    #: Reference database storage (c.f. :class:`~darc.const.DB`).
    database = DB
    #: Generate table name dynamically (c.f. :func:`~darc.model.table_function`).
    #: The right-hand side resolves to the module-level ``table_function``.
    table_function = table_function
class BaseModel(peewee.Model):
    """Common base class for the data models in this module.

    Notes:
        The model will implicitly have a :class:`~peewee.AutoField`
        attribute named as :attr:`id`.

    """

    #: Shared metadata (database handle and table naming) for data models.
    Meta = BaseMeta

    def to_dict(self, keep_id: bool = False) -> 'Dict[str, Any]':
        """Convert record to :obj:`dict`.

        Args:
            keep_id: Whether to keep the ``id`` auto field in the result.

        Returns:
            The data converted through :func:`playhouse.shortcuts.model_to_dict`,
            without the ``id`` key unless ``keep_id`` is true.

        """
        record = playhouse.shortcuts.model_to_dict(self)
        if not keep_id:
            # Tolerate a missing key rather than raising.
            record.pop('id', None)
        return record
class HostnameModel(BaseModel):
    """Data model for a hostname record.

    Each hostname is stored once (the ``hostname`` column is unique),
    together with its proxy type and two timestamps.

    """

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`); unique.
    hostname: str = peewee.CharField(max_length=255, unique=True)  # a valid FQDN is at most 255 characters
    #: Proxy type (c.f. :attr:`link.proxy <darc.link.Link.proxy>`),
    #: declared with the :class:`Proxy` enumeration as ``choices``.
    proxy: 'Proxy' = IntEnumField(choices=Proxy)
    #: Timestamp of first ``new_host`` submission.
    discovery: 'datetime' = peewee.DateTimeField()
    #: Timestamp of last related submission.
    last_seen: 'datetime' = peewee.DateTimeField()
class RobotsModel(BaseModel):
    """Data model for ``robots.txt`` data.

    Each record belongs to one :class:`HostnameModel` and carries the
    submission timestamp, the document content and its path.

    """

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`);
    #: foreign key to :class:`HostnameModel` with backref ``robots``.
    host: 'HostnameModel' = peewee.ForeignKeyField(HostnameModel, backref='robots')
    #: Timestamp of the submission.
    timestamp: 'datetime' = peewee.DateTimeField()
    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()
class SitemapModel(BaseModel):
    """Data model for ``sitemap.xml`` data.

    Each record belongs to one :class:`HostnameModel` and carries the
    submission timestamp, the document content and its path.

    """

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`);
    #: foreign key to :class:`HostnameModel` with backref ``sitemaps``.
    host: 'HostnameModel' = peewee.ForeignKeyField(HostnameModel, backref='sitemaps')
    #: Timestamp of the submission.
    timestamp: 'datetime' = peewee.DateTimeField()
    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()
class HostsModel(BaseModel):
    """Data model for ``hosts.txt`` data.

    Each record belongs to one :class:`HostnameModel` and carries the
    submission timestamp, the document content and its path.

    """

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`);
    #: foreign key to :class:`HostnameModel` with backref ``hosts``.
    host: 'HostnameModel' = peewee.ForeignKeyField(HostnameModel, backref='hosts')
    #: Timestamp of the submission.
    timestamp: 'datetime' = peewee.DateTimeField()
    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()
class URLModel(BaseModel):
    """Data model for a requested URL."""

    #: Timestamp of last related submission.
    last_seen: 'datetime' = peewee.DateTimeField()
    #: Original URL (c.f. :attr:`link.url <darc.link.Link.url>`).
    url: str = peewee.TextField()
    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`).
    host: HostnameModel = peewee.ForeignKeyField(HostnameModel, backref='urls')
    #: Proxy type (c.f. :attr:`link.proxy <darc.link.Link.proxy>`).
    proxy: str = peewee.CharField(max_length=8)
    #: Base path (c.f. :attr:`link.base <darc.link.Link.base>`).
    base: str = peewee.CharField()
    #: Link hash (c.f. :attr:`link.name <darc.link.Link.name>`).
    name: str = peewee.FixedCharField(max_length=64)

    @classmethod
    def get_by_url(cls, url: str) -> 'URLModel':
        """Fetch the record whose ``url`` column equals ``url``.

        Args:
            url: URL to select.

        Returns:
            Selected URL model, as returned by ``cls.get``.

        """
        matches_url = cls.url == url
        return cls.get(matches_url)

    @property
    def parents(self) -> 'List[URLModel]':
        """Query for the URLs on which this URL was identified.

        Joins :class:`URLThroughModel` on its ``parent`` key, keeps the
        rows whose ``child`` is this record, and orders them by URL.
        """
        via_parent = URLModel.select().join(URLThroughModel, on=URLThroughModel.parent)
        found_here = via_parent.where(URLThroughModel.child == self)
        return found_here.order_by(URLModel.url)

    # NOTE(review): ``childrent`` looks like a typo of ``children``; the name
    # is kept unchanged because it is part of the public interface.
    @property
    def childrent(self) -> 'List[URLModel]':
        """Query for the URLs that were identified from this URL.

        Joins :class:`URLThroughModel` on its ``child`` key, keeps the
        rows whose ``parent`` is this record, and orders them by URL.
        """
        via_child = URLModel.select().join(URLThroughModel, on=URLThroughModel.child)
        found_from = via_child.where(URLThroughModel.parent == self)
        return found_from.order_by(URLModel.url)
class URLThroughModel(BaseModel):
    """Data model for the map of URL extraction chain.

    Each row links one ``parent`` URL to one ``child`` URL; the pair is
    covered by a unique index.

    """

    # NOTE(review): ``backref='parents'`` has the same name as the
    # ``URLModel.parents`` property defined in this file; the backref
    # presumably replaces that property once this class is created, and it
    # would list rows in which the URL is the *parent* -- confirm the naming.

    #: The URL on which the link was identified (foreign key to :class:`URLModel`).
    parent: 'URLModel' = peewee.ForeignKeyField(URLModel, backref='parents')
    #: The URL that was identified (foreign key to :class:`URLModel`).
    child: 'URLModel' = peewee.ForeignKeyField(URLModel, backref='children')

    class Meta(BaseMeta):
        indexes = (
            # Specify a unique multi-column index on from/to-url.
            (('parent', 'child'), True),
        )
class RequestsDocumentModel(BaseModel):
    """Data model for documents from ``requests`` submission.

    Each record belongs to one :class:`URLModel` and carries the document
    content and its path.

    """

    #: Original URL (c.f. :attr:`link.url <darc.link.Link.url>`);
    #: foreign key to :class:`URLModel` with backref ``requests``.
    url: 'URLModel' = peewee.ForeignKeyField(URLModel, backref='requests')
    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()
class SeleniumDocumentModel(BaseModel):
    """Data model for documents from ``selenium`` submission.

    Each record belongs to one :class:`URLModel` and carries the document
    content and its path.

    """

    #: Original URL (c.f. :attr:`link.url <darc.link.Link.url>`);
    #: foreign key to :class:`URLModel` with backref ``selenium``.
    url: 'URLModel' = peewee.ForeignKeyField(URLModel, backref='selenium')
    #: Document data as :obj:`bytes`.
    data: bytes = peewee.BlobField()
    #: Path to the document.
    path: str = peewee.CharField()