Web Backend Demo
This is a demo of the API for communication between the darc crawlers (darc.submit) and the web UI.
See also
Please refer to the data schema for more information about the submission data.
This demo assumes that the web UI is developed using the Flask microframework.
# -*- coding: utf-8 -*-
import sys
from typing import TYPE_CHECKING
import flask
if TYPE_CHECKING:
from flask import Response
# Flask application. The first argument is the application's import name,
# which Flask uses to locate resources; for a single module the documented
# value is ``__name__`` (a file path such as ``__file__`` is not a module name).
app = flask.Flask(__name__)
@app.route('/api/new_host', methods=['POST'])
def new_host() -> 'Response':
    """Receive the host information submitted for a newly discovered host.

    When a new host is discovered, the :mod:`darc` crawler submits the host
    information to this endpoint. The submission includes ``robots.txt``
    (if it exists) and ``sitemap.xml`` (if any).

    This demo handler only reads the JSON body of the request; the actual
    processing is left as a placeholder and an empty response is returned.

    Data format::

        {
            // partial flag - true / false
            "$PARTIAL$": ...,
            // force flag - true / false
            "$FORCE$": ...,
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...,
                // originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "backref": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // robots.txt from the host (if not exists, then ``null``)
            "Robots": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/robots.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // sitemaps from the host (if none, then ``null``)
            "Sitemaps": [
                {
                    // path of the file, relative path (to data root path ``PATH_DATA``) in container
                    //   - <proxy>/<scheme>/<host>/sitemap_<name>.xml
                    "path": ...,
                    // content of the file (**base64** encoded)
                    "data": ...,
                },
                ...
            ],
            // hosts.txt from the host (if proxy type is ``i2p``; if not exists, then ``null``)
            "Hosts": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/hosts.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    """
    # parsed JSON body of the submission (unused in this demo)
    payload = flask.request.get_json()  # pylint: disable=unused-variable

    # placeholder: handle the submitted host information here
    ...

    empty_response = flask.make_response()
    return empty_response
@app.route('/api/requests', methods=['POST'])
def from_requests() -> 'Response':
    """Receive the record of a URL crawled with ``requests``.

    When crawling, the :mod:`darc` crawler first fetches the URL using
    ``requests``, to check its availability and to save its HTTP headers
    information. That information is submitted to the web UI through this
    endpoint.

    This demo handler only reads the JSON body of the request; the actual
    processing is left as a placeholder and an empty response is returned.

    Data format::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...,
                // originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "backref": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // request method
            "Method": "GET",
            // response status code
            "Status-Code": ...,
            // response reason
            "Reason": ...,
            // response cookies (if any)
            "Cookies": {
                ...
            },
            // session cookies (if any)
            "Session": {
                ...
            },
            // request headers (if any)
            "Request": {
                ...
            },
            // response headers (if any)
            "Response": {
                ...
            },
            // content type
            "Content-Type": ...,
            // requested file (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>_raw.html
                // or if the document is of generic content type, i.e. not HTML
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.dat
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // redirection history (if any)
            "History": [
                // same records as the original response
                {"...": "..."}
            ]
        }

    """
    # parsed JSON body of the submission (unused in this demo)
    payload = flask.request.get_json()  # pylint: disable=unused-variable

    # placeholder: handle the submitted ``requests`` record here
    ...

    empty_response = flask.make_response()
    return empty_response
@app.route('/api/selenium', methods=['POST'])
def from_selenium() -> 'Response':
    """Receive the record of a URL rendered with ``selenium``.

    After crawling with ``requests``, the :mod:`darc` crawler renders the URL
    using ``selenium`` with Google Chrome and its driver, to provide a fully
    rendered web page. That information is submitted to the web UI through
    this endpoint.

    This demo handler only reads the JSON body of the request; the actual
    processing is left as a placeholder and an empty response is returned.

    Note:
        This information is optional, only provided if the content type from
        ``requests`` is HTML, status code < 400, and HTML data not empty.

    Data format::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...,
                // originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "backref": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // rendered HTML document (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.html
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // web page screenshot (if not exists, then ``null``)
            "Screenshot": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.png
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    """
    # parsed JSON body of the submission (unused in this demo)
    payload = flask.request.get_json()  # pylint: disable=unused-variable

    # placeholder: handle the submitted ``selenium`` record here
    ...

    empty_response = flask.make_response()
    return empty_response
if __name__ == "__main__":
    # Script entry point: run the Flask application and hand whatever
    # ``app.run()`` returns to ``sys.exit`` as the exit status.
    exit_status = app.run()  # type: ignore[func-returns-value]
    sys.exit(exit_status)