Initial import of cleaned up documentation.
This commit is contained in:
parent
447531c0fd
commit
7d1984b4d4
|
@ -0,0 +1,19 @@
|
|||
|
||||
# Build the HTML docs, splice in the language-binding API docs, and serve the
# result locally for review.  The preview aborts if any page still contains a
# FIXME marker.  The port is derived from a hash of $USER (8000-15999) so
# multiple developers can preview on a shared host without colliding.
docpreview:
	$(MAKE) -C sphinx html
	cp -r ../bindings/java/javadoc sphinx/.out/html
	cp -r ../bindings/go/godoc sphinx/.out/html
	rm -f sphinx/.out/html/documentation
	ln -s . sphinx/.out/html/documentation
	@( cd sphinx/.out/html && ! grep FIXME * && python -m SimpleHTTPServer $$((0x$$(echo ${USER} | $(MD5SUM) | awk '{print $$1}' | cut -c1-8)%8000+8000)) )
|
||||
|
||||
docpreview_clean:
|
||||
$(MAKE) sphinx clean
|
||||
|
||||
docpackage:
|
||||
$(MAKE) -C sphinx html
|
||||
cp -r ../../foundationdb/bindings/java/javadoc sphinx/.out/html
|
||||
cp -r ../../foundationdb/bindings/go/godoc sphinx/.out/html
|
||||
rm -f sphinx/.out/html/documentation
|
||||
ln -s . sphinx/.out/html/documentation
|
||||
$(MAKE) -C sphinx package
|
|
@ -0,0 +1,461 @@
|
|||
{
|
||||
"cluster":{
|
||||
"layers":{
|
||||
"_valid":true,
|
||||
"_error":"some error description"
|
||||
},
|
||||
"processes":{
|
||||
"$map":{
|
||||
"version":"3.0.0",
|
||||
"machine_id":"0ccb4e0feddb5583010f6b77d9d10ece",
|
||||
"locality":{
|
||||
"$map":"value"
|
||||
},
|
||||
"class_source":{
|
||||
"$enum":[
|
||||
"command_line",
|
||||
"configure_auto",
|
||||
"set_class"
|
||||
]
|
||||
},
|
||||
"class_type":{
|
||||
"$enum":[
|
||||
"unset",
|
||||
"storage",
|
||||
"transaction",
|
||||
"resolution",
|
||||
"proxy",
|
||||
"master",
|
||||
"test"
|
||||
]
|
||||
},
|
||||
"roles":[
|
||||
{
|
||||
"query_queue_max":0,
|
||||
"input_bytes":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"roughness":0.0
|
||||
},
|
||||
"kvstore_used_bytes":12341234,
|
||||
"stored_bytes":12341234,
|
||||
"kvstore_free_bytes":12341234,
|
||||
"durable_bytes":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"roughness":0.0
|
||||
},
|
||||
"queue_disk_free_bytes":12341234,
|
||||
"persistent_disk_used_bytes":12341234,
|
||||
"role":{
|
||||
"$enum":[
|
||||
"master",
|
||||
"proxy",
|
||||
"log",
|
||||
"storage",
|
||||
"resolver",
|
||||
"cluster_controller"
|
||||
]
|
||||
},
|
||||
"data_version":12341234,
|
||||
"data_version_lag":12341234,
|
||||
"persistent_disk_total_bytes":12341234,
|
||||
"queue_disk_total_bytes":12341234,
|
||||
"persistent_disk_free_bytes":12341234,
|
||||
"queue_disk_used_bytes":12341234,
|
||||
"id":"eb84471d68c12d1d26f692a50000003f",
|
||||
"kvstore_total_bytes":12341234,
|
||||
"finished_queries":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"roughness":0.0
|
||||
}
|
||||
}
|
||||
],
|
||||
"command_line":"-r simulation",
|
||||
"memory":{
|
||||
"available_bytes":0,
|
||||
"limit_bytes":0,
|
||||
"used_bytes":0
|
||||
},
|
||||
"messages":[
|
||||
{
|
||||
"time":12345.12312,
|
||||
"type":"x",
|
||||
"name":{
|
||||
"$enum":[
|
||||
"file_open_error",
|
||||
"incorrect_cluster_file_contents",
|
||||
"process_error",
|
||||
"io_error",
|
||||
"io_timeout",
|
||||
"platform_error",
|
||||
"storage_server_lagging",
|
||||
"(other FDB error messages)"
|
||||
]
|
||||
},
|
||||
"raw_log_message":"<stuff/>",
|
||||
"description":"abc"
|
||||
}
|
||||
],
|
||||
"fault_domain":"0ccb4e0fdbdb5583010f6b77d9d10ece",
|
||||
"excluded":false,
|
||||
"address":"1.2.3.4:1234",
|
||||
"disk":{
|
||||
"free_bytes":3451233456234,
|
||||
"reads":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"sectors":0
|
||||
},
|
||||
"busy":0.0,
|
||||
"writes":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"sectors":0
|
||||
},
|
||||
"total_bytes":123412341234
|
||||
},
|
||||
"uptime_seconds":1234.2345,
|
||||
"cpu":{
|
||||
"usage_cores":0.0
|
||||
},
|
||||
"network":{
|
||||
"current_connections":0,
|
||||
"connections_established":{
|
||||
"hz":0.0
|
||||
},
|
||||
"connections_closed":{
|
||||
"hz":0.0
|
||||
},
|
||||
"connection_errors":{
|
||||
"hz":0.0
|
||||
},
|
||||
"megabits_sent":{
|
||||
"hz":0.0
|
||||
},
|
||||
"megabits_received":{
|
||||
"hz":0.0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"old_logs":[
|
||||
{
|
||||
"logs":[
|
||||
{
|
||||
"id":"7f8d623d0cb9966e",
|
||||
"healthy":true,
|
||||
"address":"1.2.3.4:1234"
|
||||
}
|
||||
],
|
||||
"log_replication_factor":3,
|
||||
"log_write_anti_quorum":0,
|
||||
"log_fault_tolerance":2
|
||||
}
|
||||
],
|
||||
"fault_tolerance":{
|
||||
"max_machine_failures_without_losing_availability":0,
|
||||
"max_machine_failures_without_losing_data":0
|
||||
},
|
||||
"qos":{
|
||||
"worst_queue_bytes_log_server":460,
|
||||
"performance_limited_by":{
|
||||
"reason_server_id":"7f8d623d0cb9966e",
|
||||
"reason_id":0,
|
||||
"name":{
|
||||
"$enum":[
|
||||
"workload",
|
||||
"storage_server_write_queue_size",
|
||||
"storage_server_write_bandwidth_mvcc",
|
||||
"storage_server_readable_behind",
|
||||
"log_server_mvcc_write_bandwidth",
|
||||
"log_server_write_queue",
|
||||
"storage_server_min_free_space",
|
||||
"storage_server_min_free_space_ratio",
|
||||
"log_server_min_free_space",
|
||||
"log_server_min_free_space_ratio"
|
||||
]
|
||||
},
|
||||
"description":"The database is not being saturated by the workload."
|
||||
},
|
||||
"transactions_per_second_limit":0,
|
||||
"released_transactions_per_second":0,
|
||||
"limiting_queue_bytes_storage_server":0,
|
||||
"worst_queue_bytes_storage_server":0,
|
||||
"limiting_version_lag_storage_server":0,
|
||||
"worst_version_lag_storage_server":0
|
||||
},
|
||||
"incompatible_connections":[
|
||||
|
||||
],
|
||||
"database_available":true,
|
||||
"database_locked":false,
|
||||
"generation":2,
|
||||
"latency_probe":{
|
||||
"read_seconds":7,
|
||||
"immediate_priority_transaction_start_seconds":0.0,
|
||||
"batch_priority_transaction_start_seconds":0.0,
|
||||
"transaction_start_seconds":0.0,
|
||||
"commit_seconds":0.02
|
||||
},
|
||||
"clients":{
|
||||
"count":1,
|
||||
"supported_versions":[
|
||||
{
|
||||
"client_version":"3.0.0",
|
||||
"connected_clients":[
|
||||
{
|
||||
"address":"127.0.0.1:9898",
|
||||
"log_group":"default"
|
||||
}
|
||||
],
|
||||
"count" : 1,
|
||||
"protocol_version" : "fdb00a400050001",
|
||||
"source_version" : "9430e1127b4991cbc5ab2b17f41cfffa5de07e9d"
|
||||
}
|
||||
]
|
||||
},
|
||||
"messages":[
|
||||
{
|
||||
"reasons":[
|
||||
{
|
||||
"description":"Blah."
|
||||
}
|
||||
],
|
||||
"unreachable_processes":[
|
||||
{
|
||||
"address":"1.2.3.4:1234"
|
||||
}
|
||||
],
|
||||
"name":{
|
||||
"$enum":[
|
||||
"unreachable_master_worker",
|
||||
"unreadable_configuration",
|
||||
"client_issues",
|
||||
"unreachable_processes",
|
||||
"immediate_priority_transaction_start_probe_timeout",
|
||||
"batch_priority_transaction_start_probe_timeout",
|
||||
"transaction_start_probe_timeout",
|
||||
"read_probe_timeout",
|
||||
"commit_probe_timeout",
|
||||
"storage_servers_error",
|
||||
"status_incomplete",
|
||||
"layer_status_incomplete",
|
||||
"database_availability_timeout"
|
||||
]
|
||||
},
|
||||
"issues":[
|
||||
{
|
||||
"name":{
|
||||
"$enum":[
|
||||
"incorrect_cluster_file_contents"
|
||||
]
|
||||
},
|
||||
"description":"Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally."
|
||||
}
|
||||
],
|
||||
"description":"abc"
|
||||
}
|
||||
],
|
||||
"recovery_state":{
|
||||
"required_resolvers":1,
|
||||
"required_proxies":1,
|
||||
"name":{
|
||||
"$enum":[
|
||||
"reading_coordinated_state",
|
||||
"locking_coordinated_state",
|
||||
"locking_old_transaction_servers",
|
||||
"reading_transaction_system_state",
|
||||
"configuration_missing",
|
||||
"configuration_never_created",
|
||||
"configuration_invalid",
|
||||
"recruiting_transaction_servers",
|
||||
"initializing_transaction_servers",
|
||||
"recovery_transaction",
|
||||
"writing_coordinated_state",
|
||||
"fully_recovered"
|
||||
]
|
||||
},
|
||||
"required_logs":3,
|
||||
"missing_logs":"7f8d623d0cb9966e",
|
||||
"description":"Recovery complete."
|
||||
},
|
||||
"workload":{
|
||||
"operations":{
|
||||
"writes":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"roughness":0.0
|
||||
},
|
||||
"reads":{
|
||||
"hz":0.0
|
||||
}
|
||||
},
|
||||
"bytes":{
|
||||
"written":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"roughness":0.0
|
||||
}
|
||||
},
|
||||
"transactions":{
|
||||
"started":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"roughness":0.0
|
||||
},
|
||||
"conflicted":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"roughness":0.0
|
||||
},
|
||||
"committed":{
|
||||
"hz":0.0,
|
||||
"counter":0,
|
||||
"roughness":0.0
|
||||
}
|
||||
}
|
||||
},
|
||||
"cluster_controller_timestamp":1415650089,
|
||||
"protocol_version":"fdb00a400050001",
|
||||
"configuration":{
|
||||
"resolvers":1,
|
||||
"redundancy":{
|
||||
"factor":{
|
||||
"$enum":[
|
||||
"single",
|
||||
"double",
|
||||
"triple",
|
||||
"custom",
|
||||
"two_datacenter",
|
||||
"three_datacenter",
|
||||
"three_data_hall",
|
||||
"fast_recovery_double",
|
||||
"fast_recovery_triple"
|
||||
]
|
||||
}
|
||||
},
|
||||
"storage_policy":"(zoneid^3x1)",
|
||||
"tlog_policy":"(zoneid^2x1)",
|
||||
"logs":2,
|
||||
"storage_engine":{
|
||||
"$enum":[
|
||||
"ssd",
|
||||
"ssd-1",
|
||||
"ssd-2",
|
||||
"memory",
|
||||
"custom"
|
||||
]
|
||||
},
|
||||
"coordinators_count":1,
|
||||
"excluded_servers":[
|
||||
{
|
||||
"address":"10.0.4.1"
|
||||
}
|
||||
],
|
||||
"proxies":5
|
||||
},
|
||||
"data":{
|
||||
"least_operating_space_bytes_log_server":0,
|
||||
"average_partition_size_bytes":0,
|
||||
"state":{
|
||||
"healthy":true,
|
||||
"min_replicas_remaining":0,
|
||||
"name":{
|
||||
"$enum":[
|
||||
"initializing",
|
||||
"missing_data",
|
||||
"healing",
|
||||
"healthy_repartitioning",
|
||||
"healthy_removing_server",
|
||||
"healthy_rebalancing",
|
||||
"healthy"
|
||||
]
|
||||
},
|
||||
"description":""
|
||||
},
|
||||
"least_operating_space_ratio_storage_server":0.1,
|
||||
"max_machine_failures_without_losing_availability":0,
|
||||
"total_disk_used_bytes":0,
|
||||
"total_kv_size_bytes":0,
|
||||
"partitions_count":2,
|
||||
"moving_data":{
|
||||
"total_written_bytes":0,
|
||||
"in_flight_bytes":0,
|
||||
"in_queue_bytes":0
|
||||
},
|
||||
"least_operating_space_bytes_storage_server":0,
|
||||
"max_machine_failures_without_losing_data":0
|
||||
},
|
||||
"machines":{
|
||||
"$map":{
|
||||
"network":{
|
||||
"megabits_sent":{
|
||||
"hz":0.0
|
||||
},
|
||||
"megabits_received":{
|
||||
"hz":0.0
|
||||
},
|
||||
"tcp_segments_retransmitted":{
|
||||
"hz":0.0
|
||||
}
|
||||
},
|
||||
"memory":{
|
||||
"free_bytes":0,
|
||||
"committed_bytes":0,
|
||||
"total_bytes":0
|
||||
},
|
||||
"contributing_workers":4,
|
||||
"datacenter_id":"6344abf1813eb05b",
|
||||
"excluded":false,
|
||||
"address":"1.2.3.4",
|
||||
"machine_id":"6344abf1813eb05b",
|
||||
"locality":{
|
||||
"$map":"value"
|
||||
},
|
||||
"cpu":{
|
||||
"logical_core_utilization":0.4
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"client":{
|
||||
"coordinators":{
|
||||
"coordinators":[
|
||||
{
|
||||
"reachable":true,
|
||||
"address":"127.0.0.1:4701"
|
||||
}
|
||||
],
|
||||
"quorum_reachable":true
|
||||
},
|
||||
"database_status":{
|
||||
"available":true,
|
||||
"healthy":true
|
||||
},
|
||||
"messages":[
|
||||
{
|
||||
"name":{
|
||||
"$enum":[
|
||||
"inconsistent_cluster_file",
|
||||
"unreachable_cluster_controller",
|
||||
"no_cluster_controller",
|
||||
"status_incomplete_client",
|
||||
"status_incomplete_coordinators",
|
||||
"status_incomplete_error",
|
||||
"status_incomplete_timeout",
|
||||
"status_incomplete_cluster",
|
||||
"quorum_not_reachable"
|
||||
]
|
||||
},
|
||||
"description":"The cluster file is not up to date."
|
||||
}
|
||||
],
|
||||
"timestamp":1415650089,
|
||||
"cluster_file":{
|
||||
"path":"/etc/foundationdb/fdb.cluster",
|
||||
"up_to_date":true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
.out
|
||||
.dist
|
||||
.DS_Store
|
||||
|
||||
_templates/buildinfo.html
|
||||
|
||||
# Vim
|
||||
*.swp
|
||||
|
||||
# Intellij IDEA
|
||||
.idea/
|
|
@ -0,0 +1,3 @@
|
|||
[global]
|
||||
timeout = 60
|
||||
index-url = https://pypi.python.org/simple
|
|
@ -0,0 +1,116 @@
|
|||
# Makefile for Sphinx documentation
|
||||
#
|
||||
# local vars:
|
||||
PROJECT_NAME := foundationdb-docs
|
||||
#VERSION := $(shell cat version)
|
||||
|
||||
ifeq ($(RELEASE_BUILD),true)
|
||||
RELEASE := $(VERSION)
|
||||
else
|
||||
RELEASE := $(VERSION)-SNAPSHOT
|
||||
endif
|
||||
|
||||
GIT_HEAD_REF := $(shell git rev-parse --short HEAD)
|
||||
GIT_BRANCH := $(shell git symbolic-ref --short HEAD)
|
||||
GIT_REPO_URL := $(shell git config --get remote.origin.url)
|
||||
|
||||
USER ?= $(shell whoami)
|
||||
MD5SUM ?= md5sum
|
||||
|
||||
# You can set these variables from the command line.
|
||||
#VERSIONOPTS := -D version=$(VERSION) -D release=$(RELEASE)
|
||||
SPHINXOPTS := -c .
|
||||
PAPER =
|
||||
ROOTDIR := $(CURDIR)
|
||||
BUILDDIR := $(ROOTDIR)/.out
|
||||
DISTDIR := $(ROOTDIR)/.dist
|
||||
VENVDIR := $(ROOTDIR)/.out/venv
|
||||
SPHINXBUILD = $(VENVDIR)/bin/sphinx-build
|
||||
SPHINXAUTOBUILD = $(VENVDIR)/bin/sphinx-autobuild
|
||||
TEMPLATEDIR = $(ROOTDIR)/_templates
|
||||
|
||||
# virtualenv for sphinx-build
|
||||
VENV_VERSION = virtualenv-13.0.1
|
||||
VENV_URL = https://pypi.python.org/packages/source/v/virtualenv/$(VENV_VERSION).tar.gz
|
||||
|
||||
# Internal variables.
|
||||
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||
PAPEROPT_letter = -D latex_paper_size=letter
|
||||
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
|
||||
# the i18n builder cannot share the environment and doctrees with the others
|
||||
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
|
||||
|
||||
BUILDINFO = "<small><hr/><table><tr><th>Ref:</th><td><a href='$(GIT_REPO_URL)commit/%H'><em>%h</em></a></td></tr><tr><th>Updated:</th><td><em>%cd</em></td></tr><tr><th>Committer:</th><td><a href='mailto:%ce'>%cn</a></td></tr></table><br/><center><em><a href='$(GIT_REPO_URL)'>View on GitHub</a></em></center><hr/></small>"
|
||||
|
||||
.PHONY: default help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext buildsphinx publish uptodate
|
||||
|
||||
default: html
|
||||
|
||||
help:
|
||||
@echo "Please use \`make <target>' where <target> is one of"
|
||||
@echo " html to make standalone HTML files"
|
||||
@echo " livehtml to launch a local webserver that auto-updates as changes are made"
|
||||
@echo " publish to build the html and push it to GitHub pages"
|
||||
@echo " dirhtml to make HTML files named index.html in directories"
|
||||
@echo " singlehtml to make a single large HTML file"
|
||||
@echo " pickle to make pickle files"
|
||||
@echo " json to make JSON files"
|
||||
@echo " htmlhelp to make HTML files and a HTML help project"
|
||||
@echo " qthelp to make HTML files and a qthelp project"
|
||||
@echo " devhelp to make HTML files and a Devhelp project"
|
||||
@echo " epub to make an epub"
|
||||
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
||||
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
||||
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
|
||||
@echo " text to make text files"
|
||||
@echo " man to make manual pages"
|
||||
@echo " texinfo to make Texinfo files"
|
||||
@echo " info to make Texinfo files and run them through makeinfo"
|
||||
@echo " gettext to make PO message catalogs"
|
||||
@echo " changes to make an overview of all changed/added/deprecated items"
|
||||
@echo " xml to make Docutils-native XML files"
|
||||
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
|
||||
@echo " linkcheck to check all external links for integrity"
|
||||
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
||||
@echo " buildsphinx to install sphinx binary in virtualenv"
|
||||
|
||||
# Bootstrap a private virtualenv under $(BUILDDIR) and install the pinned
# Sphinx toolchain from requirements.txt.  The download/extract step is
# skipped once $(SPHINXBUILD) exists; pip runs every time so requirement
# updates are picked up.
buildsphinx:
	if [ ! -e $(SPHINXBUILD) ]; then \
		mkdir -p $(BUILDDIR); \
		cd $(BUILDDIR); \
		curl -O $(VENV_URL); \
		tar zxvf $(VENV_VERSION).tar.gz; \
		./$(VENV_VERSION)/virtualenv.py venv; \
	fi
	. $(VENVDIR)/bin/activate && \
	cp .pip.conf $(VENVDIR)/pip.conf && \
	pip install --upgrade pip && \
	pip install --upgrade -r $(ROOTDIR)/requirements.txt;
|
||||
|
||||
clean:
|
||||
rm -rf $(BUILDDIR)
|
||||
|
||||
cleanhtml:
|
||||
rm -rf $(BUILDDIR)/html
|
||||
|
||||
cleanvirtualenv:
|
||||
rm -rf $(VENVDIR)
|
||||
|
||||
html: buildsphinx cleanhtml
|
||||
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
||||
@echo
|
||||
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
||||
|
||||
check: checkwarnings linkcheck
|
||||
|
||||
checkwarnings: buildsphinx
|
||||
$(SPHINXBUILD) -n -W -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
||||
@echo "Check finished."
|
||||
|
||||
livehtml: html
|
||||
$(SPHINXAUTOBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
||||
|
||||
# removed html prerequisite because it is previously explicitly invoked
|
||||
package:
|
||||
mkdir -p $(DISTDIR)
|
||||
cd $(BUILDDIR)/html && tar czf $(DISTDIR)/$(PROJECT_NAME)-$(RELEASE).tar.gz .
|
|
@ -0,0 +1,272 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# FoundationDB documentation build configuration file
|
||||
#
|
||||
# This file is execfile()d with the current directory set to its containing dir.
|
||||
#
|
||||
# Note that not all possible configuration values are present in this
|
||||
# autogenerated file.
|
||||
#
|
||||
# All configuration values have a default; values that are commented out
|
||||
# serve to show the default.
|
||||
|
||||
import sphinx_bootstrap_theme
|
||||
import sys
|
||||
import os
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
sys.path.insert(0, os.path.abspath('extensions'))
|
||||
|
||||
# -- General configuration -----------------------------------------------------
|
||||
|
||||
# If your documentation needs a minimal Sphinx version, state it here.
|
||||
#needs_sphinx = '1.0'
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be extensions
|
||||
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
||||
extensions = [
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx.ext.todo',
|
||||
'sphinx.ext.ifconfig',
|
||||
'brokenrole',
|
||||
'relativelink'
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = [sys.prefix + '/_templates']
|
||||
|
||||
# The suffix of source filenames.
|
||||
source_suffix = '.rst'
|
||||
|
||||
# The encoding of source files.
|
||||
#source_encoding = 'utf-8-sig'
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = 'index'
|
||||
|
||||
# General information about the project.
|
||||
project = u'FoundationDB'
|
||||
copyright = u'2015 Apple, Inc. All Rights Reserved.'
|
||||
author = u"Stephen Pimentel"
|
||||
|
||||
# Load the version information from 'versions.target'
|
||||
import xml.etree.ElementTree as ET
|
||||
version_path = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'foundationdb', 'versions.target')
|
||||
tree = ET.parse(version_path)
|
||||
root = tree.getroot()
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
# |version| and |release|, also used in various other places throughout the
|
||||
# built documents.
|
||||
#
|
||||
# The short X.Y version.
|
||||
version = root.find(".//{http://schemas.microsoft.com/developer/msbuild/2003}PackageName").text
|
||||
# The full version, including alpha/beta/rc tags.
|
||||
# FoundationDB special note: also see guide-common.rst.inc and update the link to the EC2 template
|
||||
release = root.find(".//{http://schemas.microsoft.com/developer/msbuild/2003}Version").text
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
language = None
|
||||
|
||||
# There are two options for replacing |today|: either, you set today to some
|
||||
# non-false value, then it is used:
|
||||
#today = ''
|
||||
# Else, today_fmt is used as the format for a strftime call.
|
||||
#today_fmt = '%B %d, %Y'
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
exclude_patterns = []
|
||||
|
||||
# The reST default role (used for this markup: `text`) to use for all documents.
|
||||
#default_role = None
|
||||
|
||||
# If true, '()' will be appended to :func: etc. cross-reference text.
|
||||
#add_function_parentheses = True
|
||||
|
||||
# If true, the current module name will be prepended to all description
|
||||
# unit titles (such as .. function::).
|
||||
#add_module_names = True
|
||||
|
||||
# If true, sectionauthor and moduleauthor directives will be shown in the
|
||||
# output. They are ignored by default.
|
||||
#show_authors = False
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = 'solarizedlight'
|
||||
|
||||
# A list of ignored prefixes for module index sorting.
|
||||
#modindex_common_prefix = []
|
||||
|
||||
# If true, `todo` and `todoList` produce output, else they produce nothing.
|
||||
todo_include_todos = False
|
||||
|
||||
# -- Options for HTML output ---------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
html_theme = 'bootstrap'
|
||||
|
||||
# Add any paths that contain custom themes here, relative to this directory.
|
||||
html_theme_path = sphinx_bootstrap_theme.get_html_theme_path()
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
html_theme_options = {
|
||||
'globaltoc_depth': 2,
|
||||
'globaltoc_includehidden': "true",
|
||||
'navbar_links': [
|
||||
("Site Map", "contents"),
|
||||
],
|
||||
'source_link_position': "footer",
|
||||
}
|
||||
|
||||
# The name for this set of Sphinx documents. If None, it defaults to
|
||||
# "<project> v<release> documentation".
|
||||
html_title = 'FoundationDB ' + version
|
||||
|
||||
# A shorter title for the navigation bar. Default is the same as html_title.
|
||||
#html_short_title = None
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top
|
||||
# of the sidebar.
|
||||
html_logo = sys.prefix + '/_static/logo.svg'
|
||||
|
||||
# The name of an image file (within the static path) to use as favicon of the
|
||||
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
||||
# pixels large.
|
||||
#html_favicon = None
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = [sys.prefix + '/_static']
|
||||
|
||||
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
||||
# using the given strftime format.
|
||||
html_last_updated_fmt = '%b %d, %Y'
|
||||
|
||||
# If true, SmartyPants will be used to convert quotes and dashes to
|
||||
# typographically correct entities.
|
||||
#html_use_smartypants = True
|
||||
|
||||
# Custom sidebar templates, maps document names to template names.
|
||||
html_sidebars = {
|
||||
'**': ['localtoc.html'],
|
||||
'contents': [],
|
||||
}
|
||||
|
||||
# Additional templates that should be rendered to pages, maps page names to
|
||||
# template names.
|
||||
#html_additional_pages = {}
|
||||
|
||||
# If false, no module index is generated.
|
||||
html_domain_indices = False
|
||||
|
||||
# If false, no index is generated.
|
||||
html_use_index = False
|
||||
|
||||
# If true, the index is split into individual pages for each letter.
|
||||
#html_split_index = False
|
||||
|
||||
# If true, links to the reST sources are added to the pages.
|
||||
#html_show_sourcelink = True
|
||||
|
||||
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
||||
html_show_sphinx = False
|
||||
|
||||
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
||||
html_show_copyright = True
|
||||
|
||||
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
||||
#html_file_suffix = None
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = 'FoundationDB'
|
||||
|
||||
# Disable permalinks
|
||||
html_add_permalinks = ""
|
||||
|
||||
|
||||
# -- Options for LaTeX output --------------------------------------------------
|
||||
|
||||
latex_elements = {
|
||||
# The paper size ('letterpaper' or 'a4paper').
|
||||
#'papersize': 'letterpaper',
|
||||
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
#'pointsize': '10pt',
|
||||
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
#'preamble': '',
|
||||
}
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title, author, documentclass [howto/manual]).
|
||||
latex_documents = [
|
||||
('index', 'FoundationDB.tex', u'FoundationDB Documentation',
|
||||
u'FoundationDB', 'manual'),
|
||||
]
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top of
|
||||
# the title page.
|
||||
#latex_logo = None
|
||||
|
||||
# For "manual" documents, if this is true, then toplevel headings are parts,
|
||||
# not chapters.
|
||||
#latex_use_parts = False
|
||||
|
||||
# If true, show page references after internal links.
|
||||
#latex_show_pagerefs = False
|
||||
|
||||
# If true, show URL addresses after external links.
|
||||
#latex_show_urls = False
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
#latex_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
#latex_domain_indices = True
|
||||
|
||||
|
||||
# -- Options for manual page output --------------------------------------------
|
||||
|
||||
# One entry per manual page. List of tuples
|
||||
# (source start file, name, description, authors, manual section).
|
||||
man_pages = [
|
||||
('index', 'foundationdb', u'FoundationDB Documentation',
|
||||
[u'FoundationDB'], 1)
|
||||
]
|
||||
|
||||
# If true, show URL addresses after external links.
|
||||
#man_show_urls = False
|
||||
|
||||
|
||||
# -- Options for Texinfo output ------------------------------------------------
|
||||
|
||||
# Grouping the document tree into Texinfo files. List of tuples
|
||||
# (source start file, target name, title, author,
|
||||
# dir menu entry, description, category)
|
||||
texinfo_documents = [
|
||||
('index', 'FoundationDB', u'FoundationDB Documentation',
|
||||
u'FoundationDB', 'FoundationDB', 'One line description of project.',
|
||||
'Miscellaneous'),
|
||||
]
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
#texinfo_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
#texinfo_domain_indices = True
|
||||
|
||||
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
||||
#texinfo_show_urls = 'footnote'
|
||||
|
||||
|
||||
########### Check for inappropriate use of the default role ##########
|
||||
|
||||
default_role = "broken"
|
|
@ -0,0 +1,27 @@
|
|||
#
|
||||
# brokenrole.py
|
||||
#
|
||||
# This source file is part of the FoundationDB open source project
|
||||
#
|
||||
# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
def setup(app):
|
||||
app.add_role('broken', broken_role)
|
||||
|
||||
def broken_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
|
||||
msg = inliner.reporter.error('Broken role invoked', line=lineno)
|
||||
prb = inliner.problematic(rawtext,rawtext,msg)
|
||||
return [prb],[msg]
|
|
@ -0,0 +1,48 @@
|
|||
#
|
||||
# relativelink.py
|
||||
#
|
||||
# This source file is part of the FoundationDB open source project
|
||||
#
|
||||
# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from sphinx.addnodes import toctree
|
||||
|
||||
# This extension cruelly monkey patches sphinx.environment.BuildEnvironment so
|
||||
# that toctree entries can contain relative internal links, using the syntax
|
||||
# Name <relative://relative/path>
|
||||
# This is translated into an href="relative/path"
|
||||
|
||||
# Relative links already work fine outside the toctree:
|
||||
|
||||
# Name <relative/path>_
|
||||
|
||||
def setup(app):
    """Sphinx extension entry point: monkey-patch
    ``sphinx.environment.BuildEnvironment.resolve_toctree`` so that toctree
    entries may contain relative internal links written as
    ``Name <relative://relative/path>``; the ``relative://`` prefix is
    stripped, yielding ``href="relative/path"`` in the rendered output.

    :param app: the Sphinx application object (unused beyond being the
        extension hook).
    """
    import sphinx.environment
    from docutils import nodes

    old_resolve = sphinx.environment.BuildEnvironment.resolve_toctree

    def resolve_toctree(self, docname, builder, toctree, prune=True, maxdepth=0,
                        titles_only=False, collapse=False, includehidden=False):
        # BUG FIX: the original wrapper discarded the caller's keyword
        # arguments and always invoked old_resolve with prune=True,
        # maxdepth=0, titles_only=False, collapse=False, includehidden=False
        # — in particular, the theme's includehidden setting was silently
        # ignored.  Forward the actual arguments instead.
        result = old_resolve(self, docname, builder, toctree, prune=prune,
                             maxdepth=maxdepth, titles_only=titles_only,
                             collapse=collapse, includehidden=includehidden)
        if result is None:
            return result

        # Rewrite external references that use the relative:// pseudo-scheme.
        for node in result.traverse(nodes.reference):
            if not node['internal'] and node['refuri'].startswith("relative://"):
                node['refuri'] = node['refuri'][len("relative://"):]
        return result

    sphinx.environment.BuildEnvironment.resolve_toctree = resolve_toctree
|
|
@ -0,0 +1,4 @@
|
|||
--index-url https://pypi.python.org/simple
|
||||
sphinx==1.5.6
|
||||
sphinx-bootstrap-theme==0.4.8
|
||||
pygments-style-solarized
|
|
@ -0,0 +1,577 @@
|
|||
##############
|
||||
Administration
|
||||
##############
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
:titlesonly:
|
||||
|
||||
moving-a-cluster
|
||||
|
||||
This document covers the administration of an existing FoundationDB cluster. We recommend you read this document before setting up a cluster for performance testing or production use.
|
||||
|
||||
.. note:: In FoundationDB, a "cluster" refers to one or more FoundationDB processes spread across one or more physical machines that together host a FoundationDB database.
|
||||
|
||||
To administer an externally accessible cluster, you need to understand basic system tasks. You should begin with how to :ref:`start and stop the database <administration-running-foundationdb>`. Next, you should review management of a cluster, including :ref:`adding <adding-machines-to-a-cluster>` and :ref:`removing <removing-machines-from-a-cluster>` machines, and monitoring :ref:`cluster status <administration-monitoring-cluster-status>` and the basic :ref:`server processes <administration_fdbmonitor>`. You should be familiar with :ref:`managing trace files <administration-managing-trace-files>` and :ref:`other administrative concerns <administration-other-administrative-concerns>`. Finally, you should know how to :ref:`uninstall <administration-removing>` or :ref:`upgrade <upgrading-foundationdb>` the database.
|
||||
|
||||
.. _administration-running-foundationdb:
|
||||
|
||||
Starting and stopping
|
||||
=====================
|
||||
|
||||
After installation, FoundationDB is set to start automatically. You can manually start and stop the database with the commands shown below.
|
||||
|
||||
These commands start and stop the master ``fdbmonitor`` process, which in turn starts ``fdbserver`` and ``backup-agent`` processes. See :ref:`administration_fdbmonitor` for details.
|
||||
|
||||
Linux
|
||||
-----
|
||||
|
||||
On Linux, FoundationDB is started and stopped using the ``service`` command as follows::
|
||||
|
||||
user@host$ sudo service foundationdb start
|
||||
user@host$ sudo service foundationdb stop
|
||||
|
||||
On Ubuntu, it can be prevented from starting at boot as follows (without stopping the service)::
|
||||
|
||||
user@host$ sudo update-rc.d foundationdb disable
|
||||
|
||||
On RHEL/CentOS, it can be prevented from starting at boot as follows (without stopping the service)::
|
||||
|
||||
user@host$ sudo chkconfig foundationdb off
|
||||
|
||||
macOS
|
||||
-----
|
||||
|
||||
On macOS, FoundationDB is started and stopped using ``launchctl`` as follows::
|
||||
|
||||
host:~ user$ sudo launchctl load /Library/LaunchDaemons/com.foundationdb.fdbmonitor.plist
|
||||
host:~ user$ sudo launchctl unload /Library/LaunchDaemons/com.foundationdb.fdbmonitor.plist
|
||||
|
||||
It can be stopped and prevented from starting at boot as follows::
|
||||
|
||||
host:~ user$ sudo launchctl unload -w /Library/LaunchDaemons/com.foundationdb.fdbmonitor.plist
|
||||
|
||||
.. _foundationdb-cluster-file:
|
||||
|
||||
Cluster files
|
||||
=============
|
||||
|
||||
FoundationDB servers and clients use a cluster file (usually named ``fdb.cluster``) to connect to a cluster. The contents of the cluster file are the same for all processes that connect to the cluster. An ``fdb.cluster`` file is created automatically when you install a FoundationDB server and updated automatically when you :ref:`change coordination servers <configuration-choosing-coordination-servers>`. To connect to a cluster from a client machine, you will need access to a copy of the cluster file used by the servers in the cluster. Typically, you will copy the ``fdb.cluster`` file from the :ref:`default location <default-cluster-file>` on a FoundationDB server to the default location on each client.
|
||||
|
||||
.. warning:: This file should not normally be modified manually. To change coordination servers, see :ref:`configuration-choosing-coordination-servers`.
|
||||
|
||||
.. _default-cluster-file:
|
||||
|
||||
Default cluster file
|
||||
--------------------
|
||||
|
||||
When you initially install FoundationDB, a default ``fdb.cluster`` file will be placed at a system-dependent location:
|
||||
|
||||
* Linux: ``/etc/foundationdb/fdb.cluster``
|
||||
* macOS: ``/usr/local/etc/foundationdb/fdb.cluster``
|
||||
|
||||
.. _specifying-a-cluster-file:
|
||||
|
||||
Specifying the cluster file
|
||||
---------------------------
|
||||
|
||||
All FoundationDB components can be configured to use a specified cluster file:
|
||||
|
||||
* The ``fdbcli`` tool allows a cluster file to be passed on the command line using the ``-C`` option.
|
||||
* The :doc:`client APIs <api-reference>` allow a cluster file to be passed when connecting to a cluster, usually via ``open()`` or ``create_cluster()``.
|
||||
* A FoundationDB server or ``backup-agent`` allow a cluster file to be specified in :ref:`foundationdb.conf <foundationdb-conf>`.
|
||||
|
||||
In addition, FoundationDB allows you to use the environment variable ``FDB_CLUSTER_FILE`` to specify a cluster file. This approach is helpful if you operate or access more than one cluster.
|
||||
|
||||
All FoundationDB components will determine a cluster file in the following order:
|
||||
|
||||
1. An explicitly provided file, whether a command line argument using ``-C`` or an argument to an API function, if one is given;
|
||||
2. The value of the ``FDB_CLUSTER_FILE`` environment variable, if it has been set;
|
||||
3. An ``fdb.cluster`` file in the current working directory, if one is present;
|
||||
4. The :ref:`default file <default-cluster-file>` at its system-dependent location.
|
||||
|
||||
This automatic determination of a cluster file makes it easy to write code using FoundationDB without knowing exactly where it will be installed or what database it will need to connect to.
|
||||
|
||||
.. warning:: A cluster file must have the :ref:`required permissions <cluster_file_permissions>` in order to be used.
|
||||
|
||||
.. warning:: If an explicitly provided file has been set to an invalid value (such as an empty value, a file that does not exist, or a file that is not a valid cluster file), an error will result. FoundationDB will not fall back to another file.
|
||||
|
||||
.. warning:: If ``FDB_CLUSTER_FILE`` is read and has been set to an invalid value (such as an empty value, a file that does not exist, or a file that is not a valid cluster file), an error will result. FoundationDB will not fall back to another file.
|
||||
|
||||
.. _cluster_file_permissions:
|
||||
|
||||
Required Permissions
|
||||
--------------------
|
||||
|
||||
FoundationDB servers and clients require read *and* write access to the cluster file and its parent directory. This is because certain administrative changes to the cluster configuration (see :ref:`configuration-choosing-coordination-servers`) can cause this file to be automatically modified by all servers and clients using the cluster. If a FoundationDB process cannot update the cluster file, it may eventually become unable to connect to the cluster.
|
||||
|
||||
.. _cluster-file-format:
|
||||
|
||||
Cluster file format
|
||||
-------------------
|
||||
|
||||
The cluster file contains a connection string consisting of a cluster identifier and a comma-separated list of IP addresses (not hostnames) specifying the coordination servers. The format for the file is::
|
||||
|
||||
description:ID@IP:PORT,IP:PORT,...
|
||||
|
||||
* |cluster-file-rule1|
|
||||
* |cluster-file-rule2|
|
||||
* |cluster-file-rule3|
|
||||
|
||||
Together the ``description`` and the ``ID`` should uniquely identify a FoundationDB cluster.
|
||||
|
||||
A cluster file may contain comments, marked by the ``#`` character. All characters on a line after the first occurrence of a ``#`` will be ignored.
|
||||
|
||||
Generally, a cluster file should not be modified manually. Incorrect modifications after a cluster is created could result in data loss. To change the set of coordination servers used by a cluster, see :ref:`configuration-choosing-coordination-servers`. To change the cluster ``description``, see :ref:`configuration-setting-cluster-description`.
|
||||
|
||||
It is very important that each cluster use a unique random ID. If multiple processes use the same database description and ID but different sets of coordination servers, data corruption could result.
|
||||
|
||||
.. _cluster-file-client-access:
|
||||
|
||||
Accessing cluster file information from a client
|
||||
------------------------------------------------
|
||||
|
||||
Any client connected to FoundationDB can access information about its cluster file directly from the database:
|
||||
|
||||
* To get the path to the cluster file, read the key ``\xFF\xFF/cluster_file_path``.
|
||||
* To get the contents of the cluster file, read the key ``\xFF\xFF/connection_string``.
|
||||
|
||||
.. _adding-machines-to-a-cluster:
|
||||
|
||||
Adding machines to a cluster
|
||||
============================
|
||||
|
||||
.. warning:: |development-use-only-warning|
|
||||
|
||||
You can add new machines to a cluster at any time:
|
||||
|
||||
1) :doc:`Install FoundationDB <getting-started-linux>` on the new machine.
|
||||
|
||||
2) |optimize-configuration|
|
||||
|
||||
3) Copy an :ref:`existing cluster file <specifying-a-cluster-file>` from a server in your cluster to the new machine, overwriting the existing ``fdb.cluster`` file.
|
||||
|
||||
4) Restart FoundationDB on the new machine so that it uses the new cluster file::
|
||||
|
||||
user@host2$ sudo service foundationdb restart
|
||||
|
||||
5) If you have previously :ref:`excluded <removing-machines-from-a-cluster>` a machine from the cluster, you will need to take it off the exclusion list using the ``include <ip>`` command of fdbcli before it can be a full participant in the cluster.
|
||||
|
||||
.. note:: Addresses have the form ``IP``:``PORT``.
|
||||
|
||||
.. _removing-machines-from-a-cluster:
|
||||
|
||||
Removing machines from a cluster
|
||||
==================================
|
||||
|
||||
To temporarily or permanently remove one or more machines from a FoundationDB cluster without compromising fault tolerance or availability, perform the following steps:
|
||||
|
||||
1) Make sure that your current redundancy mode will still make sense after removing the machines you want to remove. For example, if you are currently using ``triple`` redundancy and are reducing the number of servers to fewer than five, you should probably switch to a lower redundancy mode first. See :ref:`configuration-choosing-redundancy-mode`.
|
||||
|
||||
2) If any of the machines that you would like to remove is a coordinator, you should :ref:`change coordination servers <configuration-changing-coordination-servers>` to a set of servers that you will not be removing. Remember that even after changing coordinators, the old coordinators need to remain available until all servers and clients of the cluster have automatically updated their cluster files.
|
||||
|
||||
3) Use the ``exclude`` command in ``fdbcli`` on the machines you plan to remove:
|
||||
|
||||
::
|
||||
|
||||
user@host1$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> exclude 1.2.3.4 1.2.3.5 1.2.3.6
|
||||
Waiting for state to be removed from all excluded servers. This may take a while.
|
||||
It is now safe to remove these machines or processes from the cluster.
|
||||
|
||||
|
||||
``exclude`` can be used to exclude either machines (by specifying an IP address) or individual processes (by specifying an ``IP``:``PORT`` pair).
|
||||
|
||||
.. note:: Addresses have the form ``IP``:``PORT``.
|
||||
|
||||
Excluding a server doesn't shut it down immediately; data on the machine is first moved away. When the ``exclude`` command completes successfully (by returning control to the command prompt), the machines that you specified are no longer required to maintain the configured redundancy mode. A large amount of data might need to be transferred first, so be patient. When the process is complete, the excluded machine or process can be shut down without fault tolerance or availability consequences.
|
||||
|
||||
If you interrupt the exclude command with Ctrl-C after seeing the "waiting for state to be removed" message, the exclusion work will continue in the background. Repeating the command will continue waiting for the exclusion to complete. To reverse the effect of the ``exclude`` command, use the ``include`` command.
|
||||
|
||||
4) On each removed machine, stop the FoundationDB server and prevent it from starting at the next boot. Follow the :ref:`instructions for your platform <administration-running-foundationdb>`. For example, on Ubuntu::
|
||||
|
||||
user@host3$ sudo service foundationdb stop
|
||||
user@host3$ sudo update-rc.d foundationdb disable
|
||||
|
||||
5) :ref:`test-the-database` to double check that everything went smoothly, paying particular attention to the replication health.
|
||||
|
||||
6) You can optionally :ref:`uninstall <administration-removing>` the FoundationDB server package entirely and/or delete database files on removed servers.
|
||||
|
||||
7) If you ever want to add a removed machine back to the cluster, you will have to take it off the excluded servers list to which it was added in step 3. This can be done using the ``include`` command of ``fdbcli``. Typing ``exclude`` with no parameters will tell you the current list of excluded machines.
|
||||
|
||||
Moving a cluster
|
||||
================
|
||||
|
||||
The procedures for adding and removing machines can be combined into a recipe for :doc:`moving an existing cluster to new machines <moving-a-cluster>`.
|
||||
|
||||
.. _administration-monitoring-cluster-status:
|
||||
|
||||
Monitoring cluster status
|
||||
=========================
|
||||
|
||||
Use the ``status`` command of ``fdbcli`` to determine if the cluster is up and running::
|
||||
|
||||
user@host$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> status
|
||||
|
||||
The ``status`` command displays general information about the FoundationDB cluster::
|
||||
|
||||
Configuration:
|
||||
Redundancy mode - triple
|
||||
Storage engine - ssd
|
||||
Coordinators - 3
|
||||
|
||||
Cluster:
|
||||
FoundationDB processes - 3
|
||||
Machines - 3
|
||||
Memory availability - 4.1 GB per process on machine with least available
|
||||
Fault Tolerance - 0 machines
|
||||
Server time - Wed Oct 8 14:41:34 2014
|
||||
|
||||
Data:
|
||||
Replication health - Healthy
|
||||
Moving data - 0.000 GB
|
||||
Sum of key-value sizes - 0 MB
|
||||
|
||||
Operating space:
|
||||
Storage server - 1.0 GB free on most full server
|
||||
Log server - 1.0 GB free on most full server
|
||||
|
||||
Workload:
|
||||
Read rate - 2 Hz
|
||||
Write rate - 0 Hz
|
||||
Transactions started - 2 Hz
|
||||
Transactions committed - 0 Hz
|
||||
Conflict rate - 0 Hz
|
||||
|
||||
Client time: Thu Nov 20 09:50:45 2014
|
||||
|
||||
The summary fields are interpreted as follows:
|
||||
|
||||
====================== ==========================================================================================================
|
||||
Redundancy mode The currently configured redundancy mode (see the section :ref:`configuration-choosing-redundancy-mode`)
|
||||
Storage engine The currently configured storage engine (see the section :ref:`configuration-configuring-storage-subsystem`)
|
||||
Coordinators The number of FoundationDB coordination servers
|
||||
FoundationDB processes Number of FoundationDB processes participating in the cluster
|
||||
Machines Number of physical machines running at least one FoundationDB process that is participating in the cluster
|
||||
Memory availability RAM per process on machine with least available (see details below)
|
||||
Fault tolerance Maximum number of machines that can fail without losing data or availability (number for losing data will be reported separately if lower)
|
||||
Server time Timestamp from the server
|
||||
Replication health A qualitative estimate of the health of data replication
|
||||
Moving data Amount of data currently in movement between machines
|
||||
Sum of key-value sizes Estimated total size of keys and values stored (not including any overhead or replication)
|
||||
Storage server Free space for storage on the server with least available. For ``ssd`` storage engine, includes only disk; for ``memory`` storage engine, includes both RAM and disk.
|
||||
Log server Free space for log server on the server with least available.
|
||||
Read rate The current number of reads per second
|
||||
Write rate The current number of writes per second
|
||||
Transactions started The current number of transactions started per second
|
||||
Transactions committed The current number of transactions committed per second
|
||||
Conflict rate The current number of conflicts per second
|
||||
====================== ==========================================================================================================
|
||||
|
||||
The "Memory availability" is a conservative estimate of the minimal RAM available to any ``fdbserver`` process across all machines in the cluster. This value is calculated in two steps. Memory available per process is first calculated *for each machine* by taking:
|
||||
|
||||
availability = ((total - committed) + sum(processSize)) / processes
|
||||
|
||||
where:
|
||||
|
||||
=========== ==================================================
|
||||
total total RAM on the machine
|
||||
committed committed RAM on the machine
|
||||
processSize total physical memory used by a given ``fdbserver`` process
|
||||
processes number of ``fdbserver`` processes on the machine
|
||||
=========== ==================================================
|
||||
|
||||
The reported value is then the *minimum* of memory available per process *over all machines* in the cluster. If this value is below 4.0 GB, a warning message is added to the status report.
|
||||
|
||||
Process details
|
||||
---------------
|
||||
|
||||
The ``status`` command can provide detailed statistics about the cluster and the database by giving it the ``details`` argument::
|
||||
|
||||
user@host$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> status details
|
||||
|
||||
Configuration:
|
||||
Redundancy mode - triple
|
||||
Storage engine - ssd
|
||||
Coordinators - 3
|
||||
|
||||
Cluster:
|
||||
FoundationDB processes - 3
|
||||
Machines - 3
|
||||
Memory availability - 4.1 GB per process on machine with least available
|
||||
Fault Tolerance - 0 machines
|
||||
Server time - Wed Oct 8 14:41:34 2014
|
||||
|
||||
Data:
|
||||
Replication health - Healthy
|
||||
Moving data - 0.000 GB
|
||||
Sum of key-value sizes - 0 MB
|
||||
|
||||
Operating space:
|
||||
Storage server - 1.0 GB free on most full server
|
||||
Log server - 1.0 GB free on most full server
|
||||
|
||||
Workload:
|
||||
Read rate - 2 Hz
|
||||
Write rate - 0 Hz
|
||||
Transactions started - 2 Hz
|
||||
Transactions committed - 0 Hz
|
||||
Conflict rate - 0 Hz
|
||||
|
||||
Process performance details:
|
||||
10.0.4.1:4500 ( 3% cpu; 2% machine; 0.004 Gbps; 0% disk; 2.5 GB / 4.1 GB RAM )
|
||||
10.0.4.2:4500 ( 1% cpu; 2% machine; 0.004 Gbps; 0% disk; 2.5 GB / 4.1 GB RAM )
|
||||
10.0.4.3:4500 ( 1% cpu; 2% machine; 0.004 Gbps; 0% disk; 2.5 GB / 4.1 GB RAM )
|
||||
|
||||
Coordination servers:
|
||||
10.0.4.1:4500
|
||||
10.0.4.2:4500
|
||||
10.0.4.3:4500
|
||||
|
||||
Client time: Thu Nov 20 09:56:52 2014
|
||||
|
||||
Several details about individual FoundationDB processes are displayed in a list format in parenthesis after the IP address and port:
|
||||
|
||||
======= =========================================================================
|
||||
cpu CPU utilization of the individual process
|
||||
machine CPU utilization of the machine the process is running on (over all cores)
|
||||
Gbps Total input + output network traffic, in Gbps
|
||||
disk Percentage busy time of the disk subsystem on which the data resides
|
||||
REXMIT! Displayed only if there have been more than 10 TCP segments retransmitted in last 5s
|
||||
RAM Total physical memory used by process / memory available per process
|
||||
======= =========================================================================
|
||||
|
||||
In certain cases, FoundationDB's overall performance can be negatively impacted by an individual slow or degraded computer or subsystem. If you suspect this is the case, this detailed list is helpful to find the culprit.
|
||||
|
||||
If a process has had more than 10 TCP segments retransmitted in the last 5 seconds, the warning message ``REXMIT!`` is displayed between its disk and RAM values, leading to an output under ``Process performance details`` of the form::
|
||||
|
||||
10.0.4.1:4500 ( 3% cpu; 2% machine; 0.004 Gbps; 0% disk; REXMIT! 2.5 GB / 4.1 GB RAM )
|
||||
|
||||
.. _administration_fdbmonitor:
|
||||
|
||||
``fdbmonitor`` and ``fdbserver``
|
||||
================================
|
||||
|
||||
The core FoundationDB server process is ``fdbserver``. Each ``fdbserver`` process uses up to one full CPU core, so a production FoundationDB cluster will usually run N such processes on an N-core system.
|
||||
|
||||
To make configuring, starting, stopping, and restarting ``fdbserver`` processes easy, FoundationDB also comes with a singleton daemon process, ``fdbmonitor``, which is started automatically on boot. ``fdbmonitor`` reads the :ref:`foundationdb.conf <foundationdb-conf>` file and starts the configured set of ``fdbserver`` processes. It is also responsible for starting ``backup-agent``.
|
||||
|
||||
.. note:: |conf-file-change-detection|
|
||||
|
||||
During normal operation, ``fdbmonitor`` is transparent, and you interact with it only by modifying the configuration in :ref:`foundationdb.conf <foundationdb-conf>` and perhaps occasionally by :ref:`starting and stopping <administration-running-foundationdb>` it manually. If some problem prevents an ``fdbserver`` or ``backup-agent`` process from starting or causes it to stop unexpectedly, ``fdbmonitor`` will log errors to the system log.
|
||||
|
||||
.. _administration-managing-trace-files:
|
||||
|
||||
Managing trace files
|
||||
====================
|
||||
|
||||
By default, trace files are output to:
|
||||
|
||||
* ``/var/log/foundationdb/`` on Linux
|
||||
* ``/usr/local/foundationdb/logs/`` on macOS
|
||||
|
||||
Trace files are rolled every 10MB. These files are valuable to the FoundationDB development team for diagnostic purposes, and should be retained in case you need support from FoundationDB. Old trace files are automatically deleted so that there are no more than 100 MB worth of trace files per process. Both the log size and the maximum total size of the log files are configurable on a per process basis in the :ref:`configuration file <foundationdb-conf>`.
|
||||
|
||||
.. _administration-disaster-recovery:
|
||||
|
||||
Disaster Recovery
|
||||
=================
|
||||
|
||||
In the present version of FoundationDB, disaster recovery (DR) is implemented via asynchronous replication of a source cluster to a destination cluster residing in another datacenter. The asynchronous replication updates the destination cluster using transactions consistent with those that have been committed in the source cluster. In this way, the replication process guarantees that the destination cluster is always in a consistent state that matches a present or earlier state of the source cluster.
|
||||
|
||||
Recovery takes place by reversing the asynchronous replication, so the data in the destination cluster is streamed back to a source cluster. For further information, see the :ref:`overview of backups <backup-introduction>` and the :ref:`fdbdr tool <fdbdr-intro>` that performs asynchronous replication.
|
||||
|
||||
.. _administration-other-administrative-concerns:
|
||||
|
||||
Other administrative concerns
|
||||
=============================
|
||||
|
||||
.. _storage-space-requirements:
|
||||
|
||||
Storage space requirements
|
||||
--------------------------
|
||||
|
||||
FoundationDB's storage space requirements depend on which storage engine is used.
|
||||
|
||||
Using the ``ssd`` storage engine, data is stored in B-trees that add some overhead.
|
||||
|
||||
* For key-value pairs larger than about 100 bytes, overhead should usually be less than 2x per replica. In a triple-replicated configuration, the raw capacity required might be 5x the size of the data. However, SSDs often require over-provisioning (e.g. keeping the drive less than 75% full) for best performance, so 10x would be a reasonable number. For example, 100GB of raw key-values would require 1TB of raw capacity.
|
||||
|
||||
* For very small key-value pairs, the overhead can be a large factor but not usually more than about 40 bytes per replica. Therefore, with triple replication and SSD over-provisioning, allowing 200 bytes of raw storage capacity for each very small key-value pair would be a reasonable guess. For example, 1 billion very small key-value pairs would require 200GB of raw storage.
|
||||
|
||||
Using the ``memory`` storage engine, both memory and disk space need to be considered.
|
||||
|
||||
* There is a fixed overhead of 72 bytes of memory for each key-value pair. Furthermore, memory is allocated in chunks whose sizes are powers of 2, leading to a variable padding overhead for each key-value pair. Finally, there is some overhead within memory chunks. For example, a 32 byte chunk has 6 bytes of overhead and therefore can only contain 26 bytes. As a result, a 27-byte key-value pair will be stored in a 64 byte chunk. The absolute amount of overhead within a chunk increases for larger chunks.
|
||||
|
||||
* Disk space usage is about 8x the original data size. The memory storage engine interleaves a snapshot on disk with a transaction log, with the resulting snapshot 2x the data size. A snapshot can't be dropped from its log until the next snapshot is completely written, so 2 snapshots must be kept at 4x the data size. The two-file durable queue can't overwrite data in one file until all the data in the other file has been dropped, resulting in 8x the data size. Finally, it should be noted that disk space is not reclaimed when key-value pairs are cleared.
|
||||
|
||||
Running out of storage space
|
||||
----------------------------
|
||||
|
||||
FoundationDB is aware of the free storage space on each node. It attempts to load all nodes equally so that no node runs out of space before the others. The database attempts to gracefully stop writes as storage space decreases to 100 MB, refusing to start new transactions with priorities other than ``SYSTEM_IMMEDIATE``. This lower bound on free space leaves space to allow you to use ``SYSTEM_IMMEDIATE`` transactions to remove data.
|
||||
|
||||
The measure of free space depends on the storage engine. For the memory storage engine, which is the default after installation, total space is limited to the lesser of the ``storage_memory`` configuration parameter (1 GB in the default configuration) or a fraction of the free disk space.
|
||||
|
||||
If the disk is rapidly filled by other programs, trace files, etc., FoundationDB may be forced to stop with significant amounts of queued writes. The only way to restore the availability of the database at this point is to manually free storage space by deleting files.
|
||||
|
||||
Virtual machines
|
||||
----------------
|
||||
|
||||
Processes running in different VMs on a single machine will appear to FoundationDB as being hardware isolated. FoundationDB takes pains to assure that data replication is protected from hardware-correlated failures. If FoundationDB is run in multiple VMs on a single machine this protection will be subverted. An administrator can inform FoundationDB of this hardware sharing, however, by specifying a machine ID using the ``machine_id`` parameter in :ref:`foundationdb.conf <foundationdb-conf>`. All processes on VMs that share hardware should specify the same ``machine_id``.
|
||||
|
||||
Datacenters
|
||||
------------
|
||||
|
||||
FoundationDB is datacenter aware and supports operation across datacenters. In a multiple-datacenter configuration, it is recommended that you set the :ref:`redundancy mode <configuration-choosing-redundancy-mode>` to ``three_datacenter`` and that you set the ``datacenter_id`` parameter for all FoundationDB processes in :ref:`foundationdb.conf <foundationdb-conf>`.
|
||||
|
||||
If you specify the ``-a`` option to any FoundationDB process in your cluster, you should specify it to all such processes. Processes which do not have a specified datacenter ID on the command line are considered part of a default "unset" datacenter. FoundationDB will incorrectly believe that these processes are failure-isolated from other datacenters, which can reduce performance and fault tolerance.
|
||||
|
||||
.. _administration-removing:
|
||||
|
||||
Uninstalling
|
||||
============
|
||||
|
||||
To uninstall FoundationDB from a cluster of one or more machines:
|
||||
|
||||
1. Uninstall the packages on each machine in the cluster.
|
||||
|
||||
* On Ubuntu use::
|
||||
|
||||
user@host$ sudo dpkg -P foundationdb-clients foundationdb-server
|
||||
|
||||
* On RHEL/CentOS use::
|
||||
|
||||
user@host$ sudo rpm -e foundationdb-clients foundationdb-server
|
||||
|
||||
* On macOS use::
|
||||
|
||||
host:~ user$ sudo /usr/local/foundationdb/uninstall-FoundationDB.sh
|
||||
|
||||
2. Delete all the data and configuration files stored by FoundationDB.
|
||||
|
||||
* On Linux these will be in ``/var/lib/foundationdb/``, ``/var/log/foundationdb/``, and ``/etc/foundationdb/`` by default.
|
||||
* On macOS these will be in ``/usr/local/foundationdb/`` and ``/usr/local/etc/foundationdb/`` by default.
|
||||
|
||||
.. _upgrading-foundationdb:
|
||||
|
||||
Upgrading
|
||||
=========
|
||||
|
||||
When a FoundationDB package is installed on a machine that already has a previous version, the package will upgrade FoundationDB to the newer version. For recent versions, the upgrade will preserve all previous data and configuration settings. (See the :ref:`notes on specific versions <version-specific-upgrading>` for exceptions.)
|
||||
|
||||
To upgrade a FoundationDB cluster, you must install the updated version of FoundationDB on each machine in the cluster. As the installations are taking place, the cluster will become unavailable until a sufficient number of machines have been upgraded. By following the steps below, you can perform a production upgrade with minimal downtime (seconds to minutes) and maintain all database guarantees. The instructions below assume that Linux packages are being used.
|
||||
|
||||
.. warning:: |development-use-only-warning|
|
||||
|
||||
Stage the packages
|
||||
------------------
|
||||
|
||||
Go to :doc:`downloads` and select Ubuntu or RHEL/CentOS, as appropriate for your system. Download both the client and server packages and copy them to each machine in your cluster.
|
||||
|
||||
.. warning:: |upgrade-client-server-warning|
|
||||
|
||||
Perform the upgrade
|
||||
-------------------
|
||||
|
||||
For **Ubuntu**, perform the upgrade using the dpkg command:
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
user@host$ sudo dpkg -i |package-deb-clients| \\
|
||||
|package-deb-server|
|
||||
|
||||
For **RHEL/CentOS**, perform the upgrade using the rpm command:
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
user@host$ sudo rpm -Uvh |package-rpm-clients| \\
|
||||
|package-rpm-server|
|
||||
|
||||
The ``foundationdb-clients`` package also installs the :doc:`Python <api-python>` and :doc:`C <api-c>` APIs. If your clients use :doc:`Ruby <api-ruby>`, `Java <javadoc/index.html>`_, :doc:`Node.js <api-node>`, or `Go <godoc/fdb.html>`_, follow the instructions in the corresponding language documentation to install the APIs.
|
||||
|
||||
Test the database
|
||||
-----------------
|
||||
|
||||
Test the database to verify that it is operating normally by running ``fdbcli`` and :ref:`reviewing the cluster status <administration-monitoring-cluster-status>`.
|
||||
|
||||
Restart your application clients
|
||||
--------------------------------
|
||||
|
||||
Stop and restart all application clients to reload the upgraded FoundationDB dynamic libraries.
|
||||
|
||||
.. _version-specific-upgrading:
|
||||
|
||||
Version-specific notes on upgrading
|
||||
===================================
|
||||
|
||||
Upgrading from 5.0.x
|
||||
--------------------
|
||||
|
||||
Upgrades from 5.0.x will keep all your old data and configuration settings.
|
||||
|
||||
Upgrading from 4.6.x
|
||||
--------------------
|
||||
|
||||
Upgrades from 4.6.x will keep all your old data and configuration settings.
|
||||
|
||||
Upgrading from 4.5.x
|
||||
--------------------
|
||||
|
||||
Upgrades from 4.5.x will keep all your old data and configuration settings.
|
||||
|
||||
Upgrading from 4.4.x
|
||||
--------------------
|
||||
|
||||
Upgrades from 4.4.x will keep all your old data and configuration settings.
|
||||
|
||||
Upgrading from 4.3.x
|
||||
--------------------
|
||||
|
||||
Backup and DR must be stopped before upgrading. Upgrades from 4.3.x will keep all your old data and configuration settings.
|
||||
|
||||
Upgrading from 4.2.x
|
||||
--------------------
|
||||
|
||||
Backup and DR must be stopped before upgrading. Upgrades from 4.2.x will keep all your old data and configuration settings.
|
||||
|
||||
Upgrading from 4.1.x
|
||||
--------------------
|
||||
|
||||
Backup and DR must be stopped before upgrading. Upgrades from 4.1.x will keep all your old data and configuration settings.
|
||||
|
||||
Upgrading from 4.0.x
|
||||
--------------------
|
||||
|
||||
Backup and DR must be stopped before upgrading. Upgrades from 4.0.x will keep all your old data and configuration settings.
|
||||
|
||||
Upgrading from 3.x
|
||||
--------------------
|
||||
|
||||
To upgrade from versions prior to 4.0, you should first upgrade to 4.0 and then to the current version.
|
||||
|
||||
.. _upgrading-from-older-versions:
|
||||
|
||||
Upgrading from Older Versions
|
||||
-----------------------------
|
||||
|
||||
Upgrades from versions older than 3.0.0 are no longer supported. To upgrade from an older version, first upgrade to 4.0.x, then upgrade to the desired version.
|
|
@ -0,0 +1,53 @@
|
|||
#############
|
||||
Anti-Features
|
||||
#############
|
||||
|
||||
What is an anti-feature?
|
||||
========================
|
||||
|
||||
FoundationDB's design clearly distinguishes its set of :doc:`core features <features>` from those that are better supported elsewhere, often as :doc:`layers <layer-concept>`. FoundationDB has deliberately limited its database core API to the minimal set of features needed to expose a scalable, fault-tolerant database with ACID transactions and high performance. As a result, there is also a set of *anti-features:* features that FoundationDB does not intend to provide in its database core. (For a detailed list of technical limitations, whether relating to our design or just to the current version, see :doc:`known-limitations`).
|
||||
|
||||
Data models
|
||||
===========
|
||||
|
||||
With the rise of NoSQL databases, a wide variety of data models have become popular. Geospatial models are used for geographical data, JSON and XML for hierarchical documents, and column-family models for sparse, tabular data. Of course, the traditional relational model is also still widely used. Each of these data models is useful and appropriate for different use cases. An application that is even moderately complex may have distinct data sets that best fit different models.
|
||||
|
||||
FoundationDB's core exposes a single data model: an ordered key-value store. When combined with multikey, ACID transactions, this model is powerful enough to be used directly by applications, as illustrated in our :doc:`tutorial examples <tutorials>`. However, our ordered key-value store can also serve as a foundation for other data models, including those above. Because of the simplicity of the key-value model, it is easy to map higher-level models onto it. (See :doc:`data-modeling` for examples of these mappings).
|
||||
|
||||
Transactions are the essential capability that allows a developer to build a data model reliably and efficiently in a layer. If a higher-level model requires multiple key-value pairs per data item, a layer can update them all in a single transaction, ensuring their consistency.
|
||||
|
||||
Query languages
|
||||
===============
|
||||
|
||||
A variety of query languages have come into use alongside NoSQL databases. For example, new query languages specific to JSON databases have become popular. XQuery is used with XML. Languages with SQL-like syntax are often defined for column-family databases. Of course, SQL itself is still heavily used with relational databases.
|
||||
|
||||
The FoundationDB core exposes a robust and powerful API but includes no separate query language. FoundationDB empowers developers to employ a broad range of data models and use the query languages best suited to their applications, implemented as layers.
|
||||
|
||||
Analytic frameworks
|
||||
===================
|
||||
|
||||
Analytic processing of large data sets has become prominent in conjunction with NoSQL databases. Some popular approaches offer generic frameworks for batch processing (e.g., MapReduce); some offer frameworks for real-time, stream processing; some focus on more specific analytics, whether using traditional aggregation functions or statistical techniques such as machine learning. The most effective approach to analytics depends on the details of the application's data model.
|
||||
|
||||
Analytic frameworks are outside the scope of the FoundationDB core. However, because the core can use range reads to efficiently scan large swaths of data, analytics can be implemented within a layer, possibly as part of a query language.
|
||||
|
||||
Disconnected operation
|
||||
======================
|
||||
|
||||
The rise of mobile computing has led to the model of *disconnected operation* in which an application on a mobile device remains available even when it's not connected to a central server. Examples of this sort of mobile application include note taking, to-do lists, and document editing.
|
||||
|
||||
While a central server running FoundationDB could be used as a database for a mobile application to connect and sync to from time to time, FoundationDB's core does not itself directly provide disconnected operation. Because it would sacrifice ACID properties, we believe that in those applications where disconnected operation is needed, the database is the wrong tier to implement it.
|
||||
|
||||
Long-running transactions
|
||||
=========================
|
||||
|
||||
FoundationDB aims to provide low latencies across a range of metrics. Transaction latencies, in particular, are typically under 15 milliseconds. Some applications require very large operations that take several seconds or more, several orders of magnitude longer than our usual transaction latency. Large operations of this kind are best approached in FoundationDB by decomposition into a set of smaller transactions.
|
||||
|
||||
FoundationDB does not support *long-running transactions*, currently defined as those
|
||||
:ref:`lasting over five seconds <long-transactions>`. The system employs multiversion concurrency control and maintains older versions of the database for a five second period. A transaction that is kept open longer will not be able to commit. If you have a requirement to support large operations, we would be happy to assist you to implement a decomposition strategy within a layer.
|
||||
|
||||
Content delivery networks (CDN)
|
||||
===============================
|
||||
|
||||
A *content delivery network* (CDN) employs geographically dispersed datacenters to serve data with high performance to similarly dispersed end-users. While FoundationDB does support multiple datacenters, it has not been designed as a CDN. The FoundationDB core does not locate data in a geographically aware manner and does not aim to provide low write latencies (e.g., under 5 milliseconds) over large geographic distances.
|
||||
|
||||
In FoundationDB's configuration for multiple datacenters, each datacenter contains a complete, up-to-date copy of the database. Each client will have a primary datacenter, with other datacenters acting in a secondary mode to support minimal downtime if a datacenter becomes unavailable.
|
|
@ -0,0 +1,899 @@
|
|||
.. default-domain:: c
|
||||
.. highlight:: c
|
||||
|
||||
.. Required substitutions for api-common.rst.inc
|
||||
|
||||
.. |database-type| replace:: ``FDBDatabase``
|
||||
.. |database-class| replace:: :type:`FDBDatabase`
|
||||
.. |database-auto| replace:: FIXME
|
||||
.. |transaction-class| replace:: FIXME
|
||||
.. |get-key-func| replace:: :func:`fdb_transaction_get_key()`
|
||||
.. |get-range-func| replace:: :func:`fdb_transaction_get_range()`
|
||||
.. |commit-func| replace:: :func:`fdb_transaction_commit()`
|
||||
.. |reset-func-name| replace:: :func:`reset <fdb_transaction_reset()>`
|
||||
.. |reset-func| replace:: :func:`fdb_transaction_reset()`
|
||||
.. |cancel-func| replace:: :func:`fdb_transaction_cancel()`
|
||||
.. |init-func| replace:: FIXME
|
||||
.. |open-func| replace:: FIXME
|
||||
.. |set-cluster-file-func| replace:: FIXME
|
||||
.. |set-local-address-func| replace:: FIXME
|
||||
.. |on-error-func| replace:: :func:`fdb_transaction_on_error()`
|
||||
.. |null-type| replace:: FIXME
|
||||
.. |error-type| replace:: error
|
||||
.. |error-raise-type| replace:: return
|
||||
.. |future-cancel| replace:: :func:`fdb_future_cancel()`
|
||||
.. |max-watches-database-option| replace:: the MAX_WATCHES :func:`database option <fdb_database_set_option>`
|
||||
.. |future-type-string| replace:: an :type:`FDBFuture` object
|
||||
.. |read-your-writes-disable-option| replace:: the READ_YOUR_WRITES_DISABLE :func:`transaction option <fdb_transaction_set_option>`
|
||||
.. |lazy-iterator-object| replace:: FIXME
|
||||
.. |key-meth| replace:: FIXME
|
||||
.. |directory-subspace| replace:: FIXME
|
||||
.. |directory-layer| replace:: FIXME
|
||||
.. |subspace| replace:: FIXME
|
||||
.. |subspace-api| replace:: FIXME
|
||||
.. |as-foundationdb-key| replace:: FIXME
|
||||
.. |as-foundationdb-value| replace:: FIXME
|
||||
.. |tuple-layer| replace:: FIXME
|
||||
.. |dir-path-type| replace:: FIXME
|
||||
.. |node-subspace| replace:: FIXME
|
||||
.. |content-subspace| replace:: FIXME
|
||||
.. |allow-manual-prefixes| replace:: FIXME
|
||||
|
||||
.. include:: api-common.rst.inc
|
||||
|
||||
.. |future-warning| replace:: :data:`future` must represent a result of the appropriate type (i.e. must have been returned by a function documented as returning this type), or the results are undefined.
|
||||
|
||||
.. |future-get-return1| replace:: Returns zero if :data:`future` is ready and not in an error state, and a non-zero :ref:`error code <developer-guide-error-codes>` otherwise
|
||||
|
||||
.. |future-get-return2| replace:: (in which case the value of any out parameter is undefined)
|
||||
|
||||
.. |future-memory-mine| replace:: The memory referenced by the result is owned by the :type:`FDBFuture` object and will be valid until either ``fdb_future_destroy(future)`` or ``fdb_future_release_memory(future)`` is called.
|
||||
|
||||
.. |future-memory-yours1| replace:: This function may only be called once on a given :type:`FDBFuture` object, as it transfers ownership of the
|
||||
|
||||
.. |future-memory-yours2| replace:: to the caller. The caller is responsible for calling
|
||||
|
||||
.. |future-memory-yours3| replace:: when finished with the result.
|
||||
|
||||
.. |future-return0| replace:: Returns an :type:`FDBFuture` which will be set to
|
||||
|
||||
.. |future-return1| replace:: You must first wait for the :type:`FDBFuture` to be ready, check for errors,
|
||||
|
||||
.. |future-return2| replace:: and then destroy the :type:`FDBFuture` with :func:`fdb_future_destroy()`.
|
||||
|
||||
.. |future-returnvoid0| replace:: Returns an :type:`FDBFuture` representing an empty value
|
||||
|
||||
.. |future-returnvoid| replace:: |future-returnvoid0|. |future-return1| |future-return2|
|
||||
|
||||
.. |option-doc| replace:: Please see ``fdb_c_options.g.h`` for a definition of this type, along with documentation of its allowed values.
|
||||
|
||||
.. |option-parameter| replace:: If the given option is documented as taking a parameter, you must also pass a pointer to the parameter value and the parameter value's length. If the option is documented as taking an ``Int`` parameter, ``value`` must point to a signed 64-bit integer (little-endian), and ``value_length`` must be 8. This memory only needs to be valid until
|
||||
|
||||
.. |no-null| replace:: The value does not need to be NULL-terminated.
|
||||
|
||||
.. |length-of| replace:: The length of the parameter specified by
|
||||
|
||||
.. |snapshot| replace:: Non-zero if this is a :ref:`snapshot read <snapshots>`.
|
||||
|
||||
.. |sets-and-clears1| replace:: Modify the database snapshot represented by :data:`transaction`
|
||||
|
||||
.. |sets-and-clears2| replace:: The modification affects the actual database only if :data:`transaction` is later committed with :func:`fdb_transaction_commit()`.
|
||||
|
||||
=====
|
||||
C API
|
||||
=====
|
||||
|
||||
This API provides a very low-level interface to FoundationDB. It is primarily intended for use in implementing higher level APIs, rather than for direct use. If you are new to FoundationDB, you are probably better served by reading one of the other APIs first.
|
||||
|
||||
Installation
|
||||
============
|
||||
|
||||
FoundationDB's C bindings are installed with the FoundationDB client binaries (see :ref:`installing-client-binaries`).
|
||||
|
||||
On Linux,
|
||||
| ``fdb_c.h`` is installed into ``/usr/include/foundationdb/``
|
||||
| ``libfdb_c.so`` is installed into ``/usr/lib/``
|
||||
|
||||
On macOS,
|
||||
| ``fdb_c.h`` is installed into ``/usr/local/include/foundationdb/``
|
||||
| ``libfdb_c.dylib`` is installed into ``/usr/local/lib/``
|
||||
|
||||
Linking
|
||||
=======
|
||||
|
||||
The FoundationDB C bindings are provided as a shared object which may be linked against at build time, or dynamically loaded at runtime. Any program that uses this API must be able to find a platform-appropriate shared library at runtime. Generally, this condition is best met by installing the FoundationDB client binaries (see :ref:`installing-client-binaries`) on any machine where the program will be run.
|
||||
|
||||
Linux
|
||||
-----
|
||||
|
||||
When linking against ``libfdb_c.so``, you must also link against ``libm``, ``libpthread`` and ``librt``. These dependencies will be resolved by the dynamic linker when using this API via :func:`dlopen()` or an FFI.
|
||||
|
||||
macOS
|
||||
--------
|
||||
|
||||
When linking against ``libfdb_c.dylib``, no additional libraries are required.
|
||||
|
||||
API versioning
|
||||
==============
|
||||
|
||||
Prior to including ``fdb_c.h``, you must define the :macro:`FDB_API_VERSION` macro. This, together with the :func:`fdb_select_api_version()` function, allows programs written against an older version of the API to compile and run with newer versions of the C library. The current version of the FoundationDB C API is |api-version|. ::
|
||||
|
||||
#define FDB_API_VERSION 510
|
||||
#include <foundationdb/fdb_c.h>
|
||||
|
||||
.. function:: fdb_error_t fdb_select_api_version(int version)
|
||||
|
||||
Must be called before any other API functions. :data:`version` must be less than or equal to :macro:`FDB_API_VERSION` (and should almost always be equal).
|
||||
|
||||
Language bindings implemented in C which themselves expose API versioning will usually pass the version requested by the application, instead of always passing :macro:`FDB_API_VERSION`.
|
||||
|
||||
Passing a version less than :macro:`FDB_API_VERSION` will cause the API to behave as it did in the older version.
|
||||
|
||||
It is an error to call this function after it has returned successfully. It is not thread safe, and if called from more than one thread simultaneously its behavior is undefined.
|
||||
|
||||
.. note:: This is actually implemented as a macro. If you are accessing this API via :func:`dlopen()` or an FFI, you will need to use :func:`fdb_select_api_version_impl()`.
|
||||
|
||||
.. warning:: |api-version-multi-version-warning|
|
||||
|
||||
.. function:: fdb_error_t fdb_select_api_version_impl(int runtime_version, int header_version)
|
||||
|
||||
This is the actual entry point called by the :func:`fdb_select_api_version` macro. It should never be called directly from C, but if you are accessing this API via :func:`dlopen()` or an FFI, you will need to use it. ``fdb_select_api_version(v)`` is equivalent to ``fdb_select_api_version_impl(v, FDB_API_VERSION)``.
|
||||
|
||||
It is an error to call this function after it has returned successfully. It is not thread safe, and if called from more than one thread simultaneously its behavior is undefined.
|
||||
|
||||
:data:`runtime_version`
|
||||
The version of run-time behavior the API is requested to provide. Must be less than or equal to :data:`header_version`, and should almost always be equal.
|
||||
|
||||
Language bindings which themselves expose API versioning will usually pass the version requested by the application.
|
||||
|
||||
:data:`header_version`
|
||||
The version of the ABI (application binary interface) that the calling code expects to find in the shared library. If you are using an FFI, this *must* correspond to the version of the API you are using as a reference (currently |api-version|). For example, the number of arguments that a function takes may be affected by this value, and an incorrect value is unlikely to yield success.
|
||||
|
||||
.. warning:: |api-version-multi-version-warning|
|
||||
|
||||
.. function:: int fdb_get_max_api_version()
|
||||
|
||||
Returns ``FDB_API_VERSION``, the current version of the FoundationDB C API. This is the maximum version that may be passed to :func:`fdb_select_api_version()`.
|
||||
|
||||
Network
|
||||
=======
|
||||
|
||||
The FoundationDB client library performs most tasks on a singleton thread (which usually will be a different thread than your application runs on). These functions are used to configure, start and stop the FoundationDB event loop on this thread.
|
||||
|
||||
.. function:: fdb_error_t fdb_network_set_option(FDBNetworkOption option, uint8_t const* value, int value_length)
|
||||
|
||||
Called to set network options. |option-parameter| :func:`fdb_network_set_option()` returns.
|
||||
|
||||
.. type:: FDBNetworkOption
|
||||
|
||||
|option-doc|
|
||||
|
||||
.. function:: fdb_error_t fdb_setup_network()
|
||||
|
||||
Must be called after :func:`fdb_select_api_version()` (and zero or more calls to :func:`fdb_network_set_option()`) and before any other function in this API. :func:`fdb_setup_network()` can only be called once.
|
||||
|
||||
.. function:: fdb_error_t fdb_add_network_thread_completion_hook(void (*hook)(void*), void *hook_parameter)
|
||||
|
||||
Must be called after :func:`fdb_setup_network()` and prior to :func:`fdb_run_network()` if called at all. This will register the given callback to run at the completion of the network thread. If there are multiple network threads running (which might occur if one is running multiple versions of the client, for example), then the callback is invoked once on each thread. When the supplied function is called, the supplied parameter is passed to it.
|
||||
|
||||
.. function:: fdb_error_t fdb_run_network()
|
||||
|
||||
Must be called after :func:`fdb_setup_network()` before any asynchronous functions in this API can be expected to complete. Unless your program is entirely event-driven based on results of asynchronous functions in this API and has no event loop of its own, you will want to invoke this function on an auxiliary thread (which it is your responsibility to create).
|
||||
|
||||
This function will not return until :func:`fdb_stop_network()` is called by you or a serious error occurs. You must not invoke :func:`fdb_run_network()` concurrently or reentrantly while it is already running.
|
||||
|
||||
.. function:: fdb_error_t fdb_stop_network()
|
||||
|
||||
Signals the event loop invoked by :func:`fdb_run_network()` to terminate. You must call this function **and wait for** :func:`fdb_run_network()` **to return** before allowing your program to exit, or else the behavior is undefined. For example, when running :func:`fdb_run_network()` on a thread (using pthread), this will look like::
|
||||
|
||||
pthread_t network_thread; /* handle for thread which invoked fdb_run_network() */
|
||||
int err;
|
||||
|
||||
...
|
||||
|
||||
err = fdb_stop_network();
|
||||
if ( err ) {
|
||||
/* An error occurred (probably network not running) */
|
||||
}
|
||||
err = pthread_join( network_thread, NULL );
|
||||
if ( err ) {
|
||||
/* Unknown error */
|
||||
}
|
||||
exit(0);
|
||||
|
||||
This function may be called from any thread. |network-cannot-be-restarted-blurb|
|
||||
|
||||
Future
|
||||
======
|
||||
|
||||
Most functions in the FoundationDB API are asynchronous, meaning that they may return to the caller before actually delivering their result. These functions always return :type:`FDBFuture*`. An :type:`FDBFuture` object represents a result value or error to be delivered at some future time. You can wait for a Future to be "ready" -- to have a value or error delivered -- by setting a callback function, or by blocking a thread, or by polling. Once a Future is ready, you can extract either an error code or a value of the appropriate type (the documentation for the original function will tell you which :func:`fdb_future_get_*()` function you should call).
|
||||
|
||||
To use the API in a synchronous way, you would typically do something like this for each asynchronous call::
|
||||
|
||||
// Call an API that returns FDBFuture*, documented as returning type foo in the future
|
||||
f = fdb_something();
|
||||
|
||||
// Wait for the Future to be *ready*
|
||||
if ( (fdb_future_block_until_ready(f)) != 0 ) {
|
||||
// Exceptional error (e.g. out of memory)
|
||||
}
|
||||
|
||||
if ( (err = fdb_future_get_foo(f, &result)) == 0 ) {
|
||||
// Use result
|
||||
// In some cases, you must be finished with result before calling
|
||||
// fdb_future_destroy() (see the documentation for the specific
|
||||
// fdb_future_get_*() method)
|
||||
} else {
|
||||
// Handle the error. If this is an error in a transaction, see
|
||||
// fdb_transaction_on_error()
|
||||
}
|
||||
|
||||
fdb_future_destroy(f);
|
||||
|
||||
Futures make it easy to do multiple operations in parallel, by calling several asynchronous functions before waiting for any of the results. This can be important for reducing the latency of transactions.
|
||||
|
||||
See :ref:`developer-guide-programming-with-futures` for further (language-independent) discussion.
|
||||
|
||||
.. type:: FDBFuture
|
||||
|
||||
An opaque type that represents a Future in the FoundationDB C API.
|
||||
|
||||
.. function:: void fdb_future_cancel(FDBFuture* future)
|
||||
|
||||
|future-cancel-blurb|
|
||||
|
||||
.. function:: void fdb_future_destroy(FDBFuture* future)
|
||||
|
||||
Destroys an :type:`FDBFuture` object. It must be called exactly once for each FDBFuture* returned by an API function. It may be called before or after the future is ready. It will also cancel the future (and its associated operation if the latter is still outstanding).
|
||||
|
||||
.. function:: fdb_error_t fdb_future_block_until_ready(FDBFuture* future)
|
||||
|
||||
Blocks the calling thread until the given Future is ready. It will return success even if the Future is set to an error -- you must call :func:`fdb_future_get_error()` to determine that. :func:`fdb_future_block_until_ready()` will return an error only in exceptional conditions (e.g. out of memory or other operating system resources).
|
||||
|
||||
.. warning:: Never call this function from a callback passed to :func:`fdb_future_set_callback()`. This may block the thread on which :func:`fdb_run_network()` was invoked, resulting in a deadlock.
|
||||
|
||||
.. function:: fdb_bool_t fdb_future_is_ready(FDBFuture* future)
|
||||
|
||||
Returns non-zero if the Future is ready. A Future is ready if it has been set to a value or an error.
|
||||
|
||||
.. function:: fdb_error_t fdb_future_set_callback(FDBFuture* future, FDBCallback callback, void* callback_parameter)
|
||||
|
||||
Causes the :type:`FDBCallback` function to be invoked as ``callback(future, callback_parameter)`` when the given Future is ready. If the Future is already ready, the call may occur in the current thread before this function returns (but this behavior is not guaranteed). Alternatively, the call may be delayed indefinitely and take place on the thread on which :func:`fdb_run_network()` was invoked, and the callback is responsible for any necessary thread synchronization (and/or for posting work back to your application event loop, thread pool, etc. if your application's architecture calls for that).
|
||||
|
||||
.. warning:: Never call :func:`fdb_future_block_until_ready()` from a callback passed to this function. This may block the thread on which :func:`fdb_run_network()` was invoked, resulting in a deadlock.
|
||||
|
||||
.. type:: FDBCallback
|
||||
|
||||
A pointer to a function which takes :type:`FDBFuture*` and :type:`void*` and returns :type:`void`.
|
||||
|
||||
.. function:: void fdb_future_release_memory(FDBFuture* future)
|
||||
|
||||
.. note:: This function provides no benefit to most application code. It is designed for use in writing generic, thread-safe language bindings. Applications should normally call :func:`fdb_future_destroy` only.
|
||||
|
||||
This function may only be called after a successful (zero return value) call to :func:`fdb_future_get_key`, :func:`fdb_future_get_value`, or :func:`fdb_future_get_keyvalue_array`. It indicates that the memory returned by the prior get call is no longer needed by the application. After this function has been called the same number of times as ``fdb_future_get_*()``, further calls to ``fdb_future_get_*()`` will return a :ref:`future_released <developer-guide-error-codes>` error. It is still necessary to later destroy the future with :func:`fdb_future_destroy`.
|
||||
|
||||
Calling this function is optional, since :func:`fdb_future_destroy` will also release the memory returned by get functions. However, :func:`fdb_future_release_memory` leaves the future object itself intact and provides a specific error code which can be used for coordination by multiple threads racing to do something with the results of a specific future. This has proven helpful in writing binding code.
|
||||
|
||||
.. function:: fdb_error_t fdb_future_get_error(FDBFuture* future)
|
||||
|
||||
|future-get-return1|.
|
||||
|
||||
.. function:: fdb_error_t fdb_future_get_version(FDBFuture* future, int64_t* out_version)
|
||||
|
||||
Extracts a value of type version from an :type:`FDBFuture` into a caller-provided variable of type :type:`int64_t`. |future-warning|
|
||||
|
||||
|future-get-return1| |future-get-return2|.
|
||||
|
||||
.. function:: fdb_error_t fdb_future_get_key(FDBFuture* future, uint8_t const** out_key, int* out_key_length)
|
||||
|
||||
Extracts a value of type key from an :type:`FDBFuture` into caller-provided variables of type :type:`uint8_t*` (a pointer to the beginning of the key) and :type:`int` (the length of the key). |future-warning|
|
||||
|
||||
|future-get-return1| |future-get-return2|.
|
||||
|
||||
|future-memory-mine|
|
||||
|
||||
.. function:: fdb_error_t fdb_future_get_cluster(FDBFuture* future, FDBCluster** out_cluster)
|
||||
|
||||
Extracts a value of type :type:`FDBCluster*` from an :type:`FDBFuture` into a caller-provided variable. |future-warning|
|
||||
|
||||
|future-get-return1| |future-get-return2|.
|
||||
|
||||
|future-memory-yours1| :type:`FDBCluster` |future-memory-yours2| :func:`fdb_cluster_destroy()` |future-memory-yours3|
|
||||
|
||||
.. function:: fdb_error_t fdb_future_get_database(FDBFuture* future, FDBDatabase** out_database)
|
||||
|
||||
Extracts a value of type :type:`FDBDatabase*` from an :type:`FDBFuture` into a caller-provided variable. |future-warning|
|
||||
|
||||
|future-get-return1| |future-get-return2|.
|
||||
|
||||
|future-memory-yours1| :type:`FDBDatabase` |future-memory-yours2| ``fdb_database_destroy(*out_database)`` |future-memory-yours3|
|
||||
|
||||
.. function:: fdb_error_t fdb_future_get_value(FDBFuture* future, fdb_bool_t* out_present, uint8_t const** out_value, int* out_value_length)
|
||||
|
||||
Extracts a database value from an :type:`FDBFuture` into caller-provided variables. |future-warning|
|
||||
|
||||
|future-get-return1| |future-get-return2|.
|
||||
|
||||
:data:`*out_present`
|
||||
Set to non-zero if (and only if) the requested value was present in the database. (If zero, the other outputs are meaningless.)
|
||||
|
||||
:data:`*out_value`
|
||||
Set to point to the first byte of the value.
|
||||
|
||||
:data:`*out_value_length`
|
||||
Set to the length of the value (in bytes).
|
||||
|
||||
|future-memory-mine|
|
||||
|
||||
.. function:: fdb_error_t fdb_future_get_string_array(FDBFuture* future, const char*** out_strings, int* out_count)
|
||||
|
||||
Extracts an array of null-terminated C strings from an :type:`FDBFuture` into caller-provided variables. |future-warning|
|
||||
|
||||
|future-get-return1| |future-get-return2|.
|
||||
|
||||
:data:`*out_strings`
|
||||
Set to point to the first string in the array.
|
||||
|
||||
:data:`*out_count`
|
||||
Set to the number of strings in the array.
|
||||
|
||||
|future-memory-mine|
|
||||
|
||||
.. function:: fdb_error_t fdb_future_get_keyvalue_array(FDBFuture* future, FDBKeyValue const** out_kv, int* out_count, fdb_bool_t* out_more)
|
||||
|
||||
Extracts an array of :type:`FDBKeyValue` objects from an :type:`FDBFuture` into caller-provided variables. |future-warning|
|
||||
|
||||
|future-get-return1| |future-get-return2|.
|
||||
|
||||
:data:`*out_kv`
|
||||
Set to point to the first :type:`FDBKeyValue` object in the array.
|
||||
|
||||
:data:`*out_count`
|
||||
Set to the number of :type:`FDBKeyValue` objects in the array.
|
||||
|
||||
:data:`*out_more`
|
||||
Set to true if (but not necessarily only if) values remain in the *key* range requested (possibly beyond the limits requested).
|
||||
|
||||
|future-memory-mine|
|
||||
|
||||
.. type:: FDBKeyValue
|
||||
|
||||
Represents a single key-value pair in the output of :func:`fdb_future_get_keyvalue_array`. ::
|
||||
|
||||
typedef struct {
|
||||
const void* key;
|
||||
int key_length;
|
||||
const void* value;
|
||||
int value_length;
|
||||
} FDBKeyValue;
|
||||
|
||||
:data:`key`
|
||||
A pointer to a key.
|
||||
|
||||
:data:`key_length`
|
||||
The length of the key pointed to by :data:`key`.
|
||||
|
||||
:data:`value`
|
||||
A pointer to a value.
|
||||
|
||||
:data:`value_length`
|
||||
The length of the value pointed to by :data:`value`.
|
||||
|
||||
Cluster
|
||||
=======
|
||||
|
||||
.. type:: FDBCluster
|
||||
|
||||
An opaque type that represents a Cluster in the FoundationDB C API.
|
||||
|
||||
.. function:: FDBFuture* fdb_create_cluster(const char* cluster_file_path)
|
||||
|
||||
|future-return0| an :type:`FDBCluster` object. |future-return1| call :func:`fdb_future_get_cluster()` to extract the :type:`FDBCluster` object, |future-return2|
|
||||
|
||||
:data:`cluster_file_path`
|
||||
A NULL-terminated string giving a local path of a :ref:`cluster file <foundationdb-cluster-file>` (often called 'fdb.cluster') which contains connection information for the FoundationDB cluster. If cluster_file_path is NULL or an empty string, then a :ref:`default cluster file <default-cluster-file>` will be used.
|
||||
|
||||
.. function:: void fdb_cluster_destroy(FDBCluster* cluster)
|
||||
|
||||
Destroys an :type:`FDBCluster` object. It must be called exactly once for each successful call to :func:`fdb_future_get_cluster()`. This function only destroys a handle to the cluster -- your cluster will be fine!
|
||||
|
||||
.. function:: fdb_error_t fdb_cluster_set_option(FDBCluster* cluster, FDBClusterOption option, uint8_t const* value, int value_length)
|
||||
|
||||
Called to set an option on an :type:`FDBCluster`. |option-parameter| :func:`fdb_cluster_set_option()` returns.
|
||||
|
||||
.. type:: FDBClusterOption
|
||||
|
||||
|option-doc|
|
||||
|
||||
.. function:: FDBFuture* fdb_cluster_create_database(FDBCluster *cluster, uint8_t const* db_name, int db_name_length)
|
||||
|
||||
|future-return0| an :type:`FDBDatabase` object. |future-return1| call :func:`fdb_future_get_database()` to extract the :type:`FDBDatabase` object, |future-return2|
|
||||
|
||||
:data:`db_name`
|
||||
A pointer to the name of the database to be opened. |no-null| In the current FoundationDB API, the database name *must* be "DB".
|
||||
|
||||
:data:`db_name_length`
|
||||
|length-of| :data:`db_name`.
|
||||
|
||||
Database
|
||||
========
|
||||
|
||||
An |database-blurb1| Modifications to a database are performed via transactions.
|
||||
|
||||
.. type:: FDBDatabase
|
||||
|
||||
An opaque type that represents a database in the FoundationDB C API.
|
||||
|
||||
.. function:: void fdb_database_destroy(FDBDatabase* database)
|
||||
|
||||
Destroys an :type:`FDBDatabase` object. It must be called exactly once for each successful call to :func:`fdb_future_get_database()`. This function only destroys a handle to the database -- your database will be fine!
|
||||
|
||||
.. function:: fdb_error_t fdb_database_set_option(FDBDatabase* database, FDBDatabaseOption option, uint8_t const* value, int value_length)
|
||||
|
||||
Called to set an option an on :type:`FDBDatabase`. |option-parameter| :func:`fdb_database_set_option()` returns.
|
||||
|
||||
.. type:: FDBDatabaseOption
|
||||
|
||||
|option-doc|
|
||||
|
||||
.. function:: fdb_error_t fdb_database_create_transaction(FDBDatabase* database, FDBTransaction** out_transaction)
|
||||
|
||||
Creates a new transaction on the given database. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`.
|
||||
|
||||
:data:`*out_transaction`
|
||||
Set to point to the newly created :type:`FDBTransaction`.
|
||||
|
||||
Transaction
|
||||
===========
|
||||
|
||||
|transaction-blurb1|
|
||||
|
||||
Applications must provide error handling and an appropriate retry loop around the application code for a transaction. See the documentation for :func:`fdb_transaction_on_error()`.
|
||||
|
||||
|transaction-blurb2|
|
||||
|
||||
|transaction-blurb3|
|
||||
|
||||
.. type:: FDBTransaction
|
||||
|
||||
An opaque type that represents a transaction in the FoundationDB C API.
|
||||
|
||||
.. function:: void fdb_transaction_destroy(FDBTransaction* transaction)
|
||||
|
||||
Destroys an :type:`FDBTransaction` object. It must be called exactly once for each successful call to :func:`fdb_database_create_transaction()`. Destroying a transaction which has not had :func:`fdb_transaction_commit()` called implicitly "rolls back" the transaction (sets and clears do not take effect on the database).
|
||||
|
||||
.. function:: fdb_error_t fdb_transaction_set_option(FDBTransaction* transaction, FDBTransactionOption option, uint8_t const* value, int value_length)
|
||||
|
||||
Called to set an option on an :type:`FDBTransaction`. |option-parameter| :func:`fdb_transaction_set_option()` returns.
|
||||
|
||||
.. type:: FDBTransactionOption
|
||||
|
||||
|option-doc|
|
||||
|
||||
.. function:: void fdb_transaction_set_read_version(FDBTransaction* transaction, int64_t version)
|
||||
|
||||
Sets the snapshot read version used by a transaction. This is not needed in simple cases. If the given version is too old, subsequent reads will fail with error_code_past_version; if it is too new, subsequent reads may be delayed indefinitely and/or fail with error_code_future_version. If any of :func:`fdb_transaction_get_*()` have been called on this transaction already, the result is undefined.
|
||||
|
||||
.. function:: FDBFuture* fdb_transaction_get_read_version(FDBTransaction* transaction)
|
||||
|
||||
|future-return0| the transaction snapshot read version. |future-return1| call :func:`fdb_future_get_version()` to extract the version into an int64_t that you provide, |future-return2|
|
||||
|
||||
The transaction obtains a snapshot read version automatically at the time of the first call to :func:`fdb_transaction_get_*()` (including this one) and (unless causal consistency has been deliberately compromised by transaction options) is guaranteed to represent all transactions which were reported committed before that call.
|
||||
|
||||
.. function:: FDBFuture* fdb_transaction_get(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length, fdb_bool_t snapshot)
|
||||
|
||||
Reads a value from the database snapshot represented by :data:`transaction`.
|
||||
|
||||
|future-return0| the value of :data:`key_name` in the database. |future-return1| call :func:`fdb_future_get_value()` to extract the value, |future-return2|
|
||||
|
||||
See :func:`fdb_future_get_value()` to see exactly how results are unpacked. If :data:`key_name` is not present in the database, the result is not an error, but a zero for :data:`*out_present` returned from that function.
|
||||
|
||||
:data:`key_name`
|
||||
A pointer to the name of the key to be looked up in the database. |no-null|
|
||||
|
||||
:data:`key_name_length`
|
||||
|length-of| :data:`key_name`.
|
||||
|
||||
:data:`snapshot`
|
||||
|snapshot|
|
||||
|
||||
.. function:: FDBFuture* fdb_transaction_get_key(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length, fdb_bool_t or_equal, int offset, fdb_bool_t snapshot)
|
||||
|
||||
Resolves a :ref:`key selector <key-selectors>` against the keys in the database snapshot represented by :data:`transaction`.
|
||||
|
||||
|future-return0| the key in the database matching the :ref:`key selector <key-selectors>`. |future-return1| call :func:`fdb_future_get_key()` to extract the key, |future-return2|
|
||||
|
||||
:data:`key_name`, :data:`key_name_length`, :data:`or_equal`, :data:`offset`
|
||||
The four components of a :ref:`key selector <key-selectors>`.
|
||||
|
||||
:data:`snapshot`
|
||||
|snapshot|
|
||||
|
||||
.. function:: FDBFuture* fdb_transaction_get_addresses_for_key(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length)
|
||||
|
||||
Returns a list of public network addresses as strings, one for each of the storage servers responsible for storing :data:`key_name` and its associated value.
|
||||
|
||||
|future-return0| an array of strings. |future-return1| call :func:`fdb_future_get_string_array()` to extract the string array, |future-return2|
|
||||
|
||||
:data:`key_name`
|
||||
A pointer to the name of the key whose location is to be queried.
|
||||
|
||||
:data:`key_name_length`
|
||||
|length-of| :data:`key_name`.
|
||||
|
||||
.. |range-limited-by| replace:: If this limit was reached before the end of the specified range, then the :data:`*more` return of :func:`fdb_future_get_keyvalue_array()` will be set to a non-zero value.
|
||||
|
||||
.. function:: FDBFuture* fdb_transaction_get_range(FDBTransaction* transaction, uint8_t const* begin_key_name, int begin_key_name_length, fdb_bool_t begin_or_equal, int begin_offset, uint8_t const* end_key_name, int end_key_name_length, fdb_bool_t end_or_equal, int end_offset, int limit, int target_bytes, FDBStreamingMode mode, int iteration, fdb_bool_t snapshot, fdb_bool_t reverse)
|
||||
|
||||
Reads all key-value pairs in the database snapshot represented by :data:`transaction` (potentially limited by :data:`limit`, :data:`target_bytes`, or :data:`mode`) which have a key lexicographically greater than or equal to the key resolved by the begin :ref:`key selector <key-selectors>` and lexicographically less than the key resolved by the end :ref:`key selector <key-selectors>`.
|
||||
|
||||
|future-return0| an :type:`FDBKeyValue` array. |future-return1| call :func:`fdb_future_get_keyvalue_array()` to extract the key-value array, |future-return2|
|
||||
|
||||
:data:`begin_key_name`, :data:`begin_key_name_length`, :data:`begin_or_equal`, :data:`begin_offset`
|
||||
The four components of a :ref:`key selector <key-selectors>` describing the beginning of the range.
|
||||
|
||||
:data:`end_key_name`, :data:`end_key_name_length`, :data:`end_or_equal`, :data:`end_offset`
|
||||
The four components of a :ref:`key selector <key-selectors>` describing the end of the range.
|
||||
|
||||
:data:`limit`
|
||||
If non-zero, indicates the maximum number of key-value pairs to return. |range-limited-by|
|
||||
|
||||
:data:`target_bytes`
|
||||
If non-zero, indicates a (soft) cap on the combined number of bytes of keys and values to return. |range-limited-by|
|
||||
|
||||
:data:`mode`
|
||||
One of the :type:`FDBStreamingMode` values indicating how the caller would like the data in the range returned.
|
||||
|
||||
:data:`iteration`
|
||||
If :data:`mode` is :data:`FDB_STREAMING_MODE_ITERATOR`, this parameter should start at 1 and be incremented by 1 for each successive call while reading this range. In all other cases it is ignored.
|
||||
|
||||
:data:`snapshot`
|
||||
|snapshot|
|
||||
|
||||
:data:`reverse`
|
||||
|
||||
If non-zero, key-value pairs will be returned in reverse lexicographical order beginning at the end of the range.
|
||||
|
||||
.. type:: FDBStreamingMode
|
||||
|
||||
An enumeration of available streaming modes to be passed to :func:`fdb_transaction_get_range()`.
|
||||
|
||||
:data:`FDB_STREAMING_MODE_ITERATOR`
|
||||
|
||||
The caller is implementing an iterator (most likely in a binding to a higher level language). The amount of data returned depends on the value of the :data:`iteration` parameter to :func:`fdb_transaction_get_range()`.
|
||||
|
||||
:data:`FDB_STREAMING_MODE_SMALL`
|
||||
|
||||
Data is returned in small batches (not much more expensive than reading individual key-value pairs).
|
||||
|
||||
:data:`FDB_STREAMING_MODE_MEDIUM`
|
||||
|
||||
Data is returned in batches between _SMALL and _LARGE.
|
||||
|
||||
:data:`FDB_STREAMING_MODE_LARGE`
|
||||
|
||||
Data is returned in batches large enough to be, in a high-concurrency environment, nearly as efficient as possible. If the caller does not need the entire range, some disk and network bandwidth may be wasted. The batch size may still be too small to allow a single client to get high throughput from the database.
|
||||
|
||||
:data:`FDB_STREAMING_MODE_SERIAL`
|
||||
|
||||
Data is returned in batches large enough that an individual client can get reasonable read bandwidth from the database. If the caller does not need the entire range, considerable disk and network bandwidth may be wasted.
|
||||
|
||||
:data:`FDB_STREAMING_MODE_WANT_ALL`
|
||||
|
||||
The caller intends to consume the entire range and would like it all transferred as early as possible.
|
||||
|
||||
:data:`FDB_STREAMING_MODE_EXACT`
|
||||
|
||||
The caller has passed a specific row limit and wants that many rows delivered in a single batch.
|
||||
|
||||
.. function:: void fdb_transaction_set(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length, uint8_t const* value, int value_length)
|
||||
|
||||
|sets-and-clears1| to change the given key to have the given value. If the given key was not previously present in the database it is inserted.
|
||||
|
||||
|sets-and-clears2|
|
||||
|
||||
:data:`key_name`
|
||||
A pointer to the name of the key to be inserted into the database. |no-null|
|
||||
|
||||
:data:`key_name_length`
|
||||
|length-of| :data:`key_name`.
|
||||
|
||||
:data:`value`
|
||||
A pointer to the value to be inserted into the database. |no-null|
|
||||
|
||||
:data:`value_length`
|
||||
|length-of| :data:`value`.
|
||||
|
||||
.. function:: void fdb_transaction_clear(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length)
|
||||
|
||||
|sets-and-clears1| to remove the given key from the database. If the key was not previously present in the database, there is no effect.
|
||||
|
||||
|sets-and-clears2|
|
||||
|
||||
:data:`key_name`
|
||||
A pointer to the name of the key to be removed from the database. |no-null|
|
||||
|
||||
:data:`key_name_length`
|
||||
|length-of| :data:`key_name`.
|
||||
|
||||
.. function:: void fdb_transaction_clear_range(FDBTransaction* transaction, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, int end_key_name_length)
|
||||
|
||||
|sets-and-clears1| to remove all keys (if any) which are lexicographically greater than or equal to the given begin key and lexicographically less than the given end_key.
|
||||
|
||||
|sets-and-clears2|
|
||||
|
||||
:data:`begin_key_name`
|
||||
A pointer to the name of the key specifying the beginning of the range to clear. |no-null|
|
||||
|
||||
:data:`begin_key_name_length`
|
||||
|length-of| :data:`begin_key_name`.
|
||||
|
||||
:data:`end_key_name`
|
||||
A pointer to the name of the key specifying the end of the range to clear. |no-null|
|
||||
|
||||
:data:`end_key_name_length`
|
||||
|length-of| :data:`end_key_name`.
|
||||
|
||||
.. function:: void fdb_transaction_atomic_op(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length, uint8_t const* param, int param_length, FDBMutationType operationType)
|
||||
|
||||
|sets-and-clears1| to perform the operation indicated by ``operationType`` with operand ``param`` to the value stored by the given key.
|
||||
|
||||
|atomic-ops-blurb1|
|
||||
|
||||
|atomic-ops-blurb2|
|
||||
|
||||
|atomic-ops-blurb3|
|
||||
|
||||
.. warning :: |atomic-ops-warning|
|
||||
|
||||
|sets-and-clears2|
|
||||
|
||||
:data:`key_name`
|
||||
A pointer to the name of the key whose value is to be mutated.
|
||||
|
||||
:data:`key_name_length`
|
||||
|length-of| :data:`key_name`.
|
||||
|
||||
:data:`param`
|
||||
A pointer to the parameter with which the atomic operation will mutate the value associated with :data:`key_name`.
|
||||
|
||||
:data:`param_length`
|
||||
|length-of| :data:`param`.
|
||||
|
||||
:data:`operationType`
|
||||
One of the :type:`FDBMutationType` values indicating which operation should be performed.
|
||||
|
||||
.. type:: FDBMutationType
|
||||
|
||||
An enumeration of available opcodes to be passed to :func:`fdb_transaction_atomic_op()`.
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_ADD`
|
||||
|
||||
|atomic-add1|
|
||||
|
||||
|atomic-add2|
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_AND`
|
||||
|
||||
|atomic-and|
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_OR`
|
||||
|
||||
|atomic-or|
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_XOR`
|
||||
|
||||
|atomic-xor|
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_MAX`
|
||||
|
||||
|atomic-max1|
|
||||
|
||||
|atomic-max-min|
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_BYTE_MAX`
|
||||
|
||||
|atomic-byte-max|
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_MIN`
|
||||
|
||||
|atomic-min1|
|
||||
|
||||
|atomic-max-min|
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_BYTE_MIN`
|
||||
|
||||
|atomic-byte-min|
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_KEY`
|
||||
|
||||
|atomic-set-versionstamped-key-1|
|
||||
|
||||
|atomic-versionstamps-1|
|
||||
|
||||
|atomic-versionstamps-2|
|
||||
|
||||
|atomic-set-versionstamped-key-2|
|
||||
|
||||
.. warning :: |atomic-versionstamps-tuple-warning-key|
|
||||
|
||||
:data:`FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_VALUE`
|
||||
|
||||
|atomic-set-versionstamped-value|
|
||||
|
||||
|atomic-versionstamps-1|
|
||||
|
||||
|atomic-versionstamps-2|
|
||||
|
||||
.. warning :: |atomic-versionstamps-tuple-warning-value|
|
||||
|
||||
.. function:: FDBFuture* fdb_transaction_commit(FDBTransaction* transaction)
|
||||
|
||||
Attempts to commit the sets and clears previously applied to the database snapshot represented by :data:`transaction` to the actual database. The commit may or may not succeed -- in particular, if a conflicting transaction previously committed, then the commit must fail in order to preserve transactional isolation. If the commit does succeed, the transaction is durably committed to the database and all subsequently started transactions will observe its effects.
|
||||
|
||||
It is not necessary to commit a read-only transaction -- you can simply call :func:`fdb_transaction_destroy()`.
|
||||
|
||||
|future-returnvoid|
|
||||
|
||||
Callers will usually want to retry a transaction if the commit or a prior :func:`fdb_transaction_get_*()` returns a retryable error (see :func:`fdb_transaction_on_error()`).
|
||||
|
||||
|commit-unknown-result-blurb|
|
||||
|
||||
|commit-outstanding-reads-blurb|
|
||||
|
||||
.. function:: fdb_error_t fdb_transaction_get_committed_version(FDBTransaction* transaction, int64_t* out_version)
|
||||
|
||||
Retrieves the database version number at which a given transaction was committed. :func:`fdb_transaction_commit()` must have been called on :data:`transaction` and the resulting future must be ready and not an error before this function is called, or the behavior is undefined. Read-only transactions do not modify the database when committed and will have a committed version of -1. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction.
|
||||
|
||||
Note that database versions are not necessarily unique to a given transaction and so cannot be used to determine in what order two transactions completed. The only use for this function is to manually enforce causal consistency when calling :func:`fdb_transaction_set_read_version()` on another subsequent transaction.
|
||||
|
||||
Most applications will not call this function.
|
||||
|
||||
.. function:: FDBFuture* fdb_transaction_get_versionstamp(FDBTransaction* transaction)
|
||||
|
||||
|future-return0| the versionstamp which was used by any versionstamp operations in this transaction. |future-return1| call :func:`fdb_future_get_key()` to extract the key, |future-return2|
|
||||
|
||||
The future will be ready only after the successful completion of a call to |commit-func| on this Transaction. Read-only transactions do not modify the database when committed and will result in the future completing with an error. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction.
|
||||
|
||||
Most applications will not call this function.
|
||||
|
||||
.. function:: FDBFuture* fdb_transaction_watch(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length)
|
||||
|
||||
|transaction-watch-blurb|
|
||||
|
||||
|transaction-watch-committed-blurb|
|
||||
|
||||
|transaction-watch-error-blurb|
|
||||
|
||||
|future-returnvoid0| that will be set once the watch has detected a change to the value at the specified key. |future-return1| |future-return2|
|
||||
|
||||
|transaction-watch-limit-blurb|
|
||||
|
||||
:data:`key_name`
|
||||
A pointer to the name of the key to watch. |no-null|
|
||||
|
||||
:data:`key_name_length`
|
||||
|length-of| :data:`key_name`.
|
||||
|
||||
|
||||
.. function:: FDBFuture* fdb_transaction_on_error(FDBTransaction* transaction, fdb_error_t error)
|
||||
|
||||
Implements the recommended retry and backoff behavior for a transaction. This function knows which of the error codes generated by other :func:`fdb_transaction_*()` functions represent temporary error conditions and which represent application errors that should be handled by the application. It also implements an exponential backoff strategy to avoid swamping the database cluster with excessive retries when there is a high level of conflict between transactions.
|
||||
|
||||
On receiving any type of error from an :func:`fdb_transaction_*()` function, the application should:
|
||||
|
||||
1. Call :func:`fdb_transaction_on_error()` with the returned :type:`fdb_error_t` code.
|
||||
|
||||
2. Wait for the resulting future to be ready.
|
||||
|
||||
3. If the resulting future is itself an error, destroy the future and FDBTransaction and report the error in an appropriate way.
|
||||
|
||||
4. If the resulting future is not an error, destroy the future and restart the application code that performs the transaction. The transaction itself will have already been reset to its initial state, but should not be destroyed and re-created because state used by :func:`fdb_transaction_on_error()` to implement its backoff strategy and state related to timeouts and retry limits is stored there.
|
||||
|
||||
|future-returnvoid|
|
||||
|
||||
.. function:: void fdb_transaction_reset(FDBTransaction* transaction)
|
||||
|
||||
Reset :data:`transaction` to its initial state. This is similar to calling :func:`fdb_transaction_destroy()` followed by :func:`fdb_database_create_transaction()`. It is not necessary to call :func:`fdb_transaction_reset()` when handling an error with :func:`fdb_transaction_on_error()` since the transaction has already been reset.
|
||||
|
||||
.. function:: void fdb_transaction_cancel(FDBTransaction* transaction)
|
||||
|
||||
|transaction-cancel-blurb|
|
||||
|
||||
.. warning :: |transaction-reset-cancel-warning|
|
||||
|
||||
.. warning :: |transaction-commit-cancel-warning|
|
||||
|
||||
.. _conflictRanges:
|
||||
|
||||
.. function:: fdb_error_t fdb_transaction_add_conflict_range(FDBTransaction* transaction, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, int end_key_name_length, FDBConflictRangeType type)
|
||||
|
||||
Adds a :ref:`conflict range <conflict-ranges>` to a transaction without performing the associated read or write.
|
||||
|
||||
.. note:: |conflict-range-note|
|
||||
|
||||
:data:`begin_key_name`
|
||||
A pointer to the name of the key specifying the beginning of the conflict range. |no-null|
|
||||
|
||||
:data:`begin_key_name_length`
|
||||
|length-of| :data:`begin_key_name`.
|
||||
|
||||
:data:`end_key_name`
|
||||
A pointer to the name of the key specifying the end of the conflict range. |no-null|
|
||||
|
||||
:data:`end_key_name_length`
|
||||
|length-of| :data:`end_key_name`.
|
||||
|
||||
:data:`type`
|
||||
One of the :type:`FDBConflictRangeType` values indicating what type of conflict range is being set.
|
||||
|
||||
.. type:: FDBConflictRangeType
|
||||
|
||||
An enumeration of available conflict range types to be passed to :func:`fdb_transaction_add_conflict_range()`.
|
||||
|
||||
:data:`FDB_CONFLICT_RANGE_TYPE_READ`
|
||||
|
||||
|add-read-conflict-range-blurb|
|
||||
|
||||
:data:`FDB_CONFLICT_RANGE_TYPE_WRITE`
|
||||
|
||||
|add-write-conflict-range-blurb|
|
||||
|
||||
.. _snapshots:
|
||||
|
||||
Snapshot reads
|
||||
--------------
|
||||
|
||||
|snapshot-blurb1|
|
||||
|
||||
|snapshot-blurb2|
|
||||
|
||||
|snapshot-blurb3|
|
||||
|
||||
In the C API, snapshot reads are performed by passing a non-zero value to the ``snapshot`` parameter of any of ``fdb_transaction_get_*`` (see for example :func:`fdb_transaction_get()`). |snapshot-blurb4|
|
||||
|
||||
.. _key-selectors:
|
||||
|
||||
Key selectors
|
||||
=============
|
||||
|
||||
|keysel-blurb1|
|
||||
|
||||
|keysel-blurb2|
|
||||
|
||||
In the FoundationDB C API, key selectors are not represented by a structure of any kind, but are instead expressed as sequential parameters to |get-key-func| and |get-range-func|. For convenience, the most common key selectors are available as C macros that expand to the appropriate parameters.
|
||||
|
||||
.. function:: FDB_KEYSEL_LAST_LESS_THAN(key_name, key_name_length)
|
||||
|
||||
.. function:: FDB_KEYSEL_LAST_LESS_OR_EQUAL(key_name, key_name_length)
|
||||
|
||||
.. function:: FDB_KEYSEL_FIRST_GREATER_THAN(key_name, key_name_length)
|
||||
|
||||
.. function:: FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(key_name, key_name_length)
|
||||
|
||||
To use one of these macros, simply replace the four parameters in the function with one of :func:`FDB_KEYSEL_*`::
|
||||
|
||||
future = fdb_transaction_get_key(transaction, "key", 3, 0, 2, 0);
|
||||
|
||||
could instead be written as::
|
||||
|
||||
future = fdb_transaction_get_key(transaction, FDB_KEYSEL_FIRST_GREATER_THAN("key", 3)+1, 0);
|
||||
|
||||
Miscellaneous
|
||||
=============
|
||||
|
||||
.. type:: fdb_bool_t
|
||||
|
||||
An integer type representing a boolean. A value of 0 is false and non-zero is true.
|
||||
|
||||
.. type:: fdb_error_t
|
||||
|
||||
An integer type representing an error. A value of 0 is success and non-zero is an error.
|
||||
|
||||
.. function:: const char* fdb_get_error(fdb_error_t code)
|
||||
|
||||
Returns a (somewhat) human-readable English message from an error code. The return value is a statically allocated null-terminated string that *must not* be freed by the caller.
|
||||
|
||||
.. function:: fdb_bool_t fdb_error_predicate(int predicate_test, fdb_error_t code)
|
||||
|
||||
Evaluates a predicate against an error code. The predicate to run should be one of the codes listed by the ``FDBErrorPredicate`` enum defined within ``fdb_c_options.g.h``. Sample predicates include ``FDB_ERROR_PREDICATE_RETRYABLE``, which can be used to determine whether the error with the given code is a retryable error or not.
|
|
@ -0,0 +1,504 @@
|
|||
.. -*- mode: rst; -*-
|
||||
|
||||
.. |separately-installed-bindings| replace::
|
||||
For the language binding to function, FoundationDB client binaries whose version is at least as recent
|
||||
must be installed. If you upgrade a language binding to a new version, you may need to upgrade the FoundationDB client binaries as well. See :ref:`installing-client-binaries`.
|
||||
|
||||
.. |project-dependency| replace::
|
||||
If you have a project with automatic dependency installation and have expressed a dependency on foundationdb, it may automatically install the latest version of the language binding when you deploy your project to a new machine. If you have not also upgraded the FoundationDB client binary, an unplanned upgrade of the language binding may encounter an incompatibility. You should therefore configure any project dependency on foundationdb in coordination with your overall upgrade strategy.
|
||||
|
||||
.. |client-installed-bindings| replace::
|
||||
The language binding requires FoundationDB client binaries whose version is at least as recent. The binding installed with the FoundationDB installation will automatically satisfy this requirement.
|
||||
|
||||
.. |api-version-rationale| replace::
|
||||
FoundationDB encapsulates multiple versions of its interface by requiring the client to explicitly specify the version of the API it uses. The purpose of this design is to allow you to upgrade the server, client libraries, or bindings without having to modify client code. The client libraries support all previous versions of the API. The API version specified by the client is used to control the behavior of the binding. You can therefore upgrade to more recent packages (and thus receive various improvements) without having to change your code.
|
||||
|
||||
.. |api-version-multi-version-warning| replace::
|
||||
When using the :ref:`multi-version client API <multi-version-client-api>`, setting an API version that is not supported by a particular client library will prevent that client from being used to connect to the cluster. In particular, you should not advance the API version of your application after upgrading your client until the cluster has also been upgraded.
|
||||
|
||||
.. |transaction-blurb1| replace::
|
||||
In FoundationDB, a transaction is a mutable snapshot of a database. All read and write operations on a transaction see and modify an otherwise-unchanging version of the database and only change the underlying database if and when the transaction is committed. Read operations do see the effects of previous write operations on the same transaction. Committing a transaction usually succeeds in the absence of :ref:`conflicts <conflict-ranges>`.
|
||||
|
||||
.. |transaction-blurb2| replace::
|
||||
Transactions group operations into a unit with the properties of *atomicity*, *isolation*, and *durability*. Transactions also provide the ability to maintain an application's invariants or integrity constraints, supporting the property of *consistency*. Together these properties are known as :ref:`ACID <ACID>`.
|
||||
|
||||
.. |transaction-blurb3| replace::
|
||||
Transactions are also causally consistent: once a transaction has been successfully committed, all subsequently created transactions will see the modifications made by it.
|
||||
|
||||
.. |used-during-commit-blurb| replace::
|
||||
If any operation is performed on a transaction after a commit has been issued but before it has returned, both the commit and the operation will |error-raise-type| a :ref:`used_during_commit <developer-guide-error-codes>` |error-type|. In this case, all subsequent operations on this transaction will |error-raise-type| this error until |reset-func-name| is called.
|
||||
|
||||
.. |unknown-result-blurb| replace::
|
||||
As with other client/server databases, in some failure scenarios a client may be unable to determine whether a transaction succeeded. In these cases, |commit-func| will |error-raise-type| a :ref:`commit_unknown_result <developer-guide-error-codes>` |error-type|. The |on-error-func| function treats this |error-type| as retryable, so retry loops that don't check for :ref:`commit_unknown_result <developer-guide-error-codes>` could execute the transaction twice. In these cases, you must consider the idempotence of the transaction.
|
||||
|
||||
.. |commit-unknown-result-blurb| replace::
|
||||
|
||||
|unknown-result-blurb| For more information, see :ref:`developer-guide-unknown-results`.
|
||||
|
||||
.. |commit-outstanding-reads-blurb| replace::
|
||||
Normally, commit will wait for outstanding reads to return. However, if those reads were snapshot reads or the transaction option for disabling "read-your-writes" has been invoked, any outstanding reads will immediately return errors.
|
||||
|
||||
.. |transaction-cancel-blurb| replace::
|
||||
Cancels the transaction. All pending or future uses of the transaction will |error-raise-type| a :ref:`transaction_cancelled <developer-guide-error-codes>` |error-type|. The transaction can be used again after it is |reset-func-name|.
|
||||
|
||||
.. |snapshot-blurb1| replace::
|
||||
Snapshot reads selectively relax FoundationDB's isolation property, reducing :ref:`conflicts <developer-guide-transaction-conflicts>` but making it harder to reason about concurrency.
|
||||
|
||||
.. |snapshot-blurb2| replace::
|
||||
By default, FoundationDB transactions guarantee :ref:`serializable isolation <ACID>`, resulting in a state that is *as if* transactions were executed one at a time, even if they were executed concurrently. Serializability has little performance cost when there are few :ref:`conflicts <developer-guide-transaction-conflicts>` but can be expensive when there are many. FoundationDB therefore also permits individual reads within a transaction to be done as snapshot reads.
|
||||
|
||||
.. |snapshot-blurb3| replace::
|
||||
Snapshot reads differ from ordinary (serializable) reads by permitting the values they read to be modified by concurrent transactions, whereas serializable reads cause conflicts in that case. Like serializable reads, snapshot reads see the effects of prior writes in the same transaction. For more information on the use of snapshot reads, see :ref:`snapshot isolation`.
|
||||
|
||||
.. |snapshot-blurb4| replace::
|
||||
Snapshot reads also interact with transaction commit a little differently than normal reads. If a snapshot read is outstanding when transaction commit is called that read will immediately return an error. (Normally, transaction commit will wait until outstanding reads return before committing.)
|
||||
|
||||
.. |keys-values-blurb| replace::
|
||||
Keys and values in FoundationDB are simple byte strings.
|
||||
|
||||
.. |keys-values-other-types-blurb| replace::
|
||||
To encode other data types, see :ref:`encoding-data-types` and the |tuple-layer|.
|
||||
|
||||
.. |as-foundationdb-blurb| replace::
|
||||
In some cases, you may have objects that are used to *represent* specific keys or values (for example, see |subspace|). As a convenience, the language binding API can work seamlessly with such objects if they implement the |as-foundationdb-key| or |as-foundationdb-value| methods, respectively. API methods that accept a key will alternately accept an object that implements the |as-foundationdb-key| method. Likewise, API methods accepting a value will also accept an object that implements the |as-foundationdb-value| method.
|
||||
|
||||
.. |as-foundationdb-warning| replace::
|
||||
|as-foundationdb-key| and |as-foundationdb-value| are not intended to implement serialization protocols for object storage. Use these functions only when your object represents a specific key or value.
|
||||
|
||||
.. |database-blurb1| replace::
|
||||
|database-type| represents a FoundationDB database --- a mutable, lexicographically ordered mapping from binary keys to binary values.
|
||||
|
||||
.. |database-blurb2| replace::
|
||||
Although |database-type| provides convenience methods for reading and writing, modifications to a database are usually via transactions, which are usually created and committed automatically by |database-auto|.
|
||||
|
||||
.. |database-sync| replace::
|
||||
The convenience methods provided by |database-type| have the same signature as the corresponding methods of ``Transaction``. However, most of the |database-type| methods are fully synchronous. (An exception is the methods for watches.) As a result, the |database-type| methods do not support the use of :ref:`implicit parallelism with futures <developer-guide-programming-with-futures>`.
|
||||
|
||||
.. |keysel-blurb1| replace::
|
||||
FoundationDB's lexicographically ordered data model permits finding keys based on their order (for example, finding the first key in the database greater than a given key). Key selectors represent a description of a key in the database that could be resolved to an actual key by |get-key-func| or used directly as the beginning or end of a range in |get-range-func|.
|
||||
|
||||
.. |keysel-blurb2| replace::
|
||||
For more about how key selectors work, see :ref:`key selectors`.
|
||||
|
||||
.. |database-atomic-ops-idempotency-note| replace::
|
||||
|
||||
Note that since some atomic operations are not idempotent, the implicit use of |database-auto| could interact with a :ref:`commit_unknown_result <developer-guide-error-codes>` |error-type| in unpredictable ways. For more information, see :ref:`developer-guide-unknown-results`.
|
||||
|
||||
.. |atomic-ops-blurb1| replace::
|
||||
An atomic operation is a single database command that carries out several logical steps: reading the value of a key, performing a transformation on that value, and writing the result. Different atomic operations perform different transformations. Like other database operations, an atomic operation is used within a transaction; however, its use within a transaction will not cause the transaction to conflict.
|
||||
|
||||
.. |atomic-ops-blurb2| replace::
|
||||
Atomic operations do not expose the current value of the key to the client but simply send the database the transformation to apply. In regard to conflict checking, an atomic operation is equivalent to a write without a read. It can only cause *other* transactions performing reads of the key to conflict.
|
||||
|
||||
.. |atomic-ops-blurb3| replace::
|
||||
By combining these logical steps into a single, read-free operation, FoundationDB can guarantee that the transaction will not conflict due to the operation. This makes atomic operations ideal for operating on keys that are frequently modified. A common example is the use of a key-value pair as a counter.
|
||||
|
||||
.. |atomic-ops-warning| replace::
|
||||
If a transaction uses both an atomic operation and a serializable read on the same key, the benefits of using the atomic operation (for both conflict checking and performance) are lost.
|
||||
|
||||
.. |atomic-add1| replace::
|
||||
Performs an addition of little-endian integers. If the existing value in the database is not present or shorter than ``param``, it is first extended to the length of ``param`` with zero bytes. If ``param`` is shorter than the existing value in the database, the existing value is truncated to match the length of ``param``.
|
||||
|
||||
.. |atomic-add2| replace::
|
||||
The integers to be added must be stored in a little-endian representation. They can be signed in two's complement representation or unsigned. You can add to an integer at a known offset in the value by prepending the appropriate number of zero bytes to ``param`` and padding with zero bytes to match the length of the value. However, this offset technique requires that you know the addition will not cause the integer field within the value to overflow.
|
||||
|
||||
.. |atomic-and| replace::
|
||||
Performs a bitwise "and" operation. If the existing value in the database is not present, then ``param`` is stored in the database. If the existing value in the database is shorter than ``param``, it is first extended to the length of ``param`` with zero bytes. If ``param`` is shorter than the existing value in the database, the existing value is truncated to match the length of ``param``.
|
||||
|
||||
.. |atomic-or| replace::
|
||||
Performs a bitwise "or" operation. If the existing value in the database is not present or shorter than ``param``, it is first extended to the length of ``param`` with zero bytes. If ``param`` is shorter than the existing value in the database, the existing value is truncated to match the length of ``param``.
|
||||
|
||||
.. |atomic-xor| replace::
|
||||
Performs a bitwise "xor" operation. If the existing value in the database is not present or shorter than ``param``, it is first extended to the length of ``param`` with zero bytes. If ``param`` is shorter than the existing value in the database, the existing value is truncated to match the length of ``param``.
|
||||
|
||||
.. |atomic-max1| replace::
|
||||
Sets the value in the database to the larger of the existing value and ``param``. If the existing value in the database is not present or shorter than ``param``, it is first extended to the length of ``param`` with zero bytes. If ``param`` is shorter than the existing value in the database, the existing value is truncated to match the length of ``param``.
|
||||
|
||||
.. |atomic-max-min| replace::
|
||||
Both the existing value and ``param`` are treated as unsigned integers. (This differs from the behavior of atomic addition.)
|
||||
|
||||
.. |atomic-min1| replace::
|
||||
Sets the value in the database to the smaller of the existing value and ``param``. If the existing value in the database is not present, then ``param`` is stored in the database. If the existing value in the database is shorter than ``param``, it is first extended to the length of ``param`` with zero bytes. If ``param`` is shorter than the existing value in the database, the existing value is truncated to match the length of ``param``.
|
||||
|
||||
.. |atomic-byte-min| replace::
|
||||
Performs lexicographic comparison of byte strings. If the existing value in the database is not present, then ``param`` is stored. Otherwise the smaller of the two values is then stored in the database.
|
||||
|
||||
.. |atomic-byte-max| replace::
|
||||
Performs lexicographic comparison of byte strings. If the existing value in the database is not present, then ``param`` is stored. Otherwise the larger of the two values is then stored in the database.
|
||||
|
||||
.. |atomic-set-versionstamped-key-1| replace::
|
||||
Transforms ``key`` using a versionstamp for the transaction. This key must be at least 12 bytes long. The final 2 bytes will be interpreted as a 16-bit little-endian integer denoting an index into the key at which to perform the transformation, and then trimmed off the key. The 10 bytes in the key beginning at the index will be overwritten with the versionstamp. If the index plus 10 bytes points past the end of the key, the result will be an error. Sets the transformed key in the database to ``param``.
|
||||
|
||||
.. |atomic-set-versionstamped-key-2| replace::
|
||||
This operation is not compatible with |read-your-writes-disable-option| and will generate an error if used with it.
|
||||
|
||||
.. |atomic-set-versionstamped-value| replace::
|
||||
Transforms ``param`` using a versionstamp for the transaction. This parameter must be at least 10 bytes long, and the first 10 bytes will be overwritten with the versionstamp. Sets ``key`` in the database to the transformed parameter.
|
||||
|
||||
.. |atomic-versionstamps-1| replace::
|
||||
A versionstamp is a 10 byte, unique, monotonically (but not sequentially) increasing value for each committed transaction. The first 8 bytes are the committed version of the database. The last 2 bytes are monotonic in the serialization order for transactions.
|
||||
|
||||
.. |atomic-versionstamps-2| replace::
|
||||
A transaction is not permitted to read any transformed key or value previously set within that transaction, and an attempt to do so will result in an error.
|
||||
|
||||
.. |atomic-versionstamps-tuple-warning-key| replace::
|
||||
At this time, versionstamped keys are not compatible with the Tuple layer except in Java and Python. Note that this implies versionstamped keys may not be used with the Subspace and Directory layers except in those languages.
|
||||
|
||||
.. |atomic-versionstamps-tuple-warning-value| replace::
|
||||
At this time, versionstamped values are not compatible with the Tuple layer.
|
||||
|
||||
.. |api-version| replace:: 510
|
||||
|
||||
.. |streaming-mode-blurb1| replace::
|
||||
When using |get-range-func| and similar interfaces, API clients can request large ranges of the database to iterate over. Making such a request doesn't necessarily mean that the client will consume all of the data in the range - sometimes the client doesn't know how far it intends to iterate in advance. FoundationDB tries to balance latency and bandwidth by requesting data for iteration in batches.
|
||||
|
||||
.. |streaming-mode-blurb2| replace::
|
||||
Streaming modes permit the API client to customize this performance tradeoff by providing extra information about how the iterator will be used.
|
||||
|
||||
.. |tuple-layer-blurb| replace::
|
||||
The FoundationDB API comes with a built-in layer for encoding tuples into keys usable by FoundationDB. The encoded key maintains the same sort order as the original tuple: sorted first by the first element, then by the second element, etc. This makes the tuple layer ideal for building a variety of higher-level data models.
|
||||
|
||||
.. |tuple-layer-note| replace::
|
||||
For general guidance on tuple usage, see the discussion in the document on :ref:`Data Modeling <data-modeling-tuples>`.
|
||||
|
||||
.. |transaction-reset-blurb| replace::
|
||||
Rollback a transaction, completely resetting it to its initial state. This is logically equivalent to destroying the transaction and creating a new one.
|
||||
|
||||
.. |transaction-reset-cancel-warning| replace::
|
||||
Be careful if you are using |reset-func| and |cancel-func| concurrently with the same transaction. Since they negate each other's effects, a race condition between these calls will leave the transaction in an unknown state.
|
||||
|
||||
.. |transaction-commit-cancel-warning| replace::
|
||||
If your program attempts to cancel a transaction after |commit-func| has been called but before it returns, unpredictable behavior will result. While it is guaranteed that the transaction will eventually end up in a cancelled state, the commit may or may not occur. Moreover, even if the call to |commit-func| appears to |error-raise-type| a :ref:`transaction_cancelled <developer-guide-error-codes>` |error-type|, the commit may have occurred or may occur in the future. This can make it more difficult to reason about the order in which transactions occur.
|
||||
|
||||
.. |transaction-get-committed-version-blurb| replace::
|
||||
Gets the version number at which a successful commit modified the database. This must be called only after the successful (non-error) completion of a call to |commit-func| on this Transaction, or the behavior is undefined. Read-only transactions do not modify the database when committed and will have a committed version of -1. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction.
|
||||
|
||||
.. |transaction-get-versionstamp-blurb| replace::
|
||||
Returns a future which will contain the versionstamp which was used by any versionstamp operations in this transaction. This function must be called before a call to |commit-func| on this Transaction. The future will be ready only after the successful completion of a call to |commit-func| on this Transaction. Read-only transactions do not modify the database when committed and will result in the future completing with an error. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction.
|
||||
|
||||
.. |transaction-watch-blurb| replace::
|
||||
|
||||
A watch's behavior is relative to the transaction that created it. A watch will report a change in relation to the key's value as readable by that transaction. The initial value used for comparison is either that of the transaction's read version or the value as modified by the transaction itself prior to the creation of the watch. If the value changes and then changes back to its initial value, the watch might not report the change.
|
||||
|
||||
.. |transaction-watch-committed-blurb| replace::
|
||||
|
||||
Until the transaction that created it has been committed, a watch will not report changes made by *other* transactions. In contrast, a watch will immediately report changes made by the transaction itself. Watches cannot be created if the transaction has set |read-your-writes-disable-option|, and an attempt to do so will |error-raise-type| a :ref:`watches_disabled <developer-guide-error-codes>` |error-type|.
|
||||
|
||||
.. |transaction-watch-error-blurb| replace::
|
||||
|
||||
If the transaction used to create a watch encounters an |error-type| during commit, then the watch will be set with that |error-type|. A transaction whose :ref:`commit result is unknown <developer-guide-unknown-results>` will set all of its watches with the :ref:`commit_unknown_result <developer-guide-error-codes>` |error-type|. If an uncommitted transaction is reset or destroyed, then any watches it created will be set with the :ref:`transaction_cancelled <developer-guide-error-codes>` |error-type|.
|
||||
|
||||
.. |transaction-watch-limit-blurb| replace::
|
||||
|
||||
By default, each database connection can have no more than 10,000 watches that have not yet reported a change. When this number is exceeded, an attempt to create a watch will |error-raise-type| a :ref:`too_many_watches <developer-guide-error-codes>` |error-type|. This limit can be changed using |max-watches-database-option|. Because a watch outlives the transaction that creates it, any watch that is no longer needed should be cancelled by calling |future-cancel| on its returned future.
|
||||
|
||||
.. |conflict-range-note| replace::
|
||||
|
||||
Most applications will use the serializable isolation that transactions provide by default and will not need to manipulate conflict ranges.
|
||||
|
||||
.. |conflict-range-blurb| replace::
|
||||
|
||||
The following make it possible to add :ref:`conflict ranges <conflict-ranges>` to a transaction.
|
||||
|
||||
.. |add-read-conflict-range-blurb| replace::
|
||||
|
||||
Adds a range of keys to the transaction's read conflict ranges as if you had read the range. As a result, other transactions that write a key in this range could cause the transaction to fail with a conflict.
|
||||
|
||||
.. |add-read-conflict-key-blurb| replace::
|
||||
|
||||
Adds a key to the transaction's read conflict ranges as if you had read the key. As a result, other transactions that concurrently write this key could cause the transaction to fail with a conflict.
|
||||
|
||||
.. |add-write-conflict-range-blurb| replace::
|
||||
|
||||
Adds a range of keys to the transaction's write conflict ranges as if you had cleared the range. As a result, other transactions that concurrently read a key in this range could fail with a conflict.
|
||||
|
||||
.. |add-write-conflict-key-blurb| replace::
|
||||
|
||||
Adds a key to the transaction's write conflict ranges as if you had written the key. As a result, other transactions that concurrently read this key could fail with a conflict.
|
||||
|
||||
.. |network-options-blurb| replace::
|
||||
|
||||
A singleton providing options which affect the entire FoundationDB client. Note that network options can also be :ref:`set using environment variables<network-options-using-environment-variables>`.
|
||||
|
||||
.. |option-trace-enable-blurb| replace::
|
||||
Enables trace file generation on this FoundationDB client. Trace files will be generated in the specified output directory. If the directory is specified as |null-type|, then the output directory will be the current working directory.
|
||||
|
||||
.. |option-trace-enable-warning| replace::
|
||||
The specified output directory must be unique to this client. In the present release, trace logging does not allow two clients to share a directory.
|
||||
|
||||
.. |option-trace-max-logs-size-blurb| replace::
|
||||
Sets the maximum size in bytes for the sum of this FoundationDB client's trace output files in a single log directory.
|
||||
|
||||
.. |option-trace-roll-size-blurb| replace::
|
||||
Sets the maximum size in bytes of a single trace output file for this FoundationDB client.
|
||||
|
||||
.. |network-options-warning| replace::
|
||||
|
||||
It is an error to set these options after the first call to |open-func| or |init-func| anywhere in your application.
|
||||
|
||||
.. |option-disable-multi-version-client-api| replace::
|
||||
|
||||
Disables the :ref:`multi-version client API <multi-version-client-api>` and instead uses the local client directly. Must be set before setting up the network.
|
||||
|
||||
.. |option-callbacks-on-external-threads| replace::
|
||||
|
||||
If set, callbacks from :ref:`external client libraries <multi-version-client-api>` can be called from threads created by the FoundationDB client library. Otherwise, callbacks will be called from either the thread used to add the callback or the network thread. Setting this option can improve performance when connected using an external client, but may not be safe to use in all environments. Must be set before setting up the network. WARNING: This feature is considered experimental at this time.
|
||||
|
||||
.. |option-external-client-library| replace::
|
||||
|
||||
Adds an external client library for use by the :ref:`multi-version client API <multi-version-client-api>`. Must be set before setting up the network.
|
||||
|
||||
.. |option-external-client-directory| replace::
|
||||
|
||||
Searches the specified path for dynamic libraries and adds them to the list of client libraries for use by the :ref:`multi-version client API <multi-version-client-api>`. Must be set before setting up the network.
|
||||
|
||||
.. |database-options-blurb| replace::
|
||||
|
||||
Database options alter the behavior of FoundationDB databases.
|
||||
|
||||
.. |option-location-cache-size-blurb| replace::
|
||||
Set the size of the client location cache. Raising this value can boost performance in very large databases where clients access data in a near-random pattern. This value must be an integer in the range [0, 2\ :sup:`31`-1]. Defaults to 100000.
|
||||
|
||||
.. |option-max-watches-blurb| replace::
|
||||
|
||||
Set the maximum number of watches allowed to be outstanding on a database connection. Increasing this number could result in increased resource usage. Reducing this number will not cancel any outstanding watches. Defaults to 10000 and cannot be larger than 1000000.
|
||||
|
||||
.. |option-machine-id-blurb| replace::
|
||||
|
||||
Specify the machine ID of a server to be preferentially used for database operations. ID must be a string of up to 16 hexadecimal digits that was used to configure :ref:`fdbserver processes <foundationdb-conf-fdbserver>`. Load balancing uses this option for location-awareness, attempting to send database operations first to servers on a specified machine, then a specified datacenter, then returning to its default algorithm.
|
||||
|
||||
.. |option-datacenter-id-blurb| replace::
|
||||
|
||||
Specify the datacenter ID to be preferentially used for database operations. ID must be a string of up to 16 hexadecimal digits that was used to configure :ref:`fdbserver processes <foundationdb-conf-fdbserver>`. Load balancing uses this option for location-awareness, attempting to send database operations first to servers on a specified machine, then a specified datacenter, then returning to its default algorithm.
|
||||
|
||||
.. |transaction-options-blurb| replace::
|
||||
|
||||
Transaction options alter the behavior of FoundationDB transactions. FoundationDB defaults to extremely safe transaction behavior, and we have worked hard to make the performance excellent with the default setting, so you should not often need to use transaction options.
|
||||
|
||||
.. |option-snapshot-ryw-enable-blurb| replace::
|
||||
|
||||
If this option is set an equal or more times in this transaction than the disable option, snapshot reads *will* see the effects of prior writes in the same transaction.
|
||||
|
||||
.. |option-snapshot-ryw-disable-blurb| replace::
|
||||
|
||||
If this option is set more times in this transaction than the enable option, snapshot reads will *not* see the effects of prior writes in the same transaction.
|
||||
|
||||
.. |option-priority-batch-blurb| replace::
|
||||
This transaction should be treated as low priority (other transactions should be processed first). Useful for doing potentially saturating batch work without interfering with the latency of other operations.
|
||||
|
||||
.. |option-priority-system-immediate-blurb| replace::
|
||||
|
||||
This transaction should be treated as extremely high priority, taking priority over other transactions and bypassing controls on transaction queuing.
|
||||
|
||||
.. |option-priority-system-immediate-warning| replace::
|
||||
|
||||
This is intended for the use of internal database functions and low-level tools; use by applications may result in severe database performance or availability problems.
|
||||
|
||||
.. |option-causal-read-risky-blurb| replace::
|
||||
|
||||
This transaction does not require the strict causal consistency guarantee that FoundationDB provides by default. The read version of the transaction will be a committed version, and usually will be the latest committed, but it might be an older version in the event of a fault or network partition.
|
||||
|
||||
.. |option-causal-write-risky-blurb| replace::
|
||||
|
||||
The application either knows that this transaction will be self-conflicting (at least one read overlaps at least one set or clear), or is willing to accept a small risk that the transaction could be committed a second time after its commit apparently succeeds. This option provides a small performance benefit.
|
||||
|
||||
.. |option-read-your-writes-disable-blurb| replace::
|
||||
|
||||
When this option is invoked, a read performed by a transaction will not see any prior mutations that occurred in that transaction, instead seeing the value which was in the database at the transaction's read version. This option may provide a small performance benefit for the client, but also disables a number of client-side optimizations which are beneficial for transactions which tend to read and write the same keys within a single transaction.
|
||||
|
||||
.. |option-read-your-writes-disable-note| replace::
|
||||
|
||||
It is an error to set this option after performing any reads or writes on the transaction.
|
||||
|
||||
.. |option-read-ahead-disable-blurb| replace::
|
||||
|
||||
Disables read-ahead caching for range reads. Under normal operation, a transaction will read extra rows from the database into cache if range reads are used to page through a series of data one row at a time (i.e. if a range read with a one row limit is followed by another one row range read starting immediately after the result of the first).
|
||||
|
||||
.. |option-access-system-keys-blurb| replace::
|
||||
|
||||
Allows this transaction to read and modify system keys (those that start with the byte ``0xFF``).
|
||||
|
||||
.. |option-access-system-keys-warning| replace::
|
||||
|
||||
Writing into system keys will likely break your database. Further, even for readers, the format of data in the system keys may change from version to version in FoundationDB.
|
||||
|
||||
.. |option-read-system-keys-blurb| replace::
|
||||
|
||||
Allows this transaction to read system keys (those that start with the byte ``0xFF``).
|
||||
|
||||
.. |option-read-system-keys-warning| replace::
|
||||
|
||||
The format of data in the system keys may change from version to version in FoundationDB.
|
||||
|
||||
.. |option-durability-dev-null-is-web-scale-blurb| replace::
|
||||
|
||||
This option has no effect yet, but may make users migrating from MongoDB more comfortable.
|
||||
|
||||
.. |option-set-retry-limit-blurb1| replace::
|
||||
|
||||
Set a maximum number of retries after which additional calls to |on-error-func| will throw the most recently seen error code. (By default, a transaction permits an unlimited number of retries.) Valid parameter values are [-1, INT_MAX]. If set to -1, the transaction returns to the default of unlimited retries.
|
||||
|
||||
.. |option-set-retry-limit-blurb2| replace::
|
||||
|
||||
Like all transaction options, the retry limit must be reset after a call to |on-error-func|. This behavior allows the user to make the retry limit dynamic.
|
||||
|
||||
.. |option-set-max-retry-delay-blurb| replace::
|
||||
|
||||
Set the maximum backoff delay incurred in the call to |on-error-func| if the error is retryable.
|
||||
|
||||
.. |option-set-timeout-blurb1| replace::
|
||||
|
||||
Set a timeout duration in milliseconds after which the transaction will be automatically cancelled. The time is measured from transaction creation (or the most recent call to |reset-func-name|, if any). Valid parameter values are [0, INT_MAX]. If set to 0, all timeouts will be disabled. Once a transaction has timed out, all pending or future uses of the transaction will |error-raise-type| a :ref:`transaction_timed_out <developer-guide-error-codes>` |error-type|. The transaction can be used again after it is |reset-func-name|.
|
||||
|
||||
.. |option-set-timeout-blurb2| replace::
|
||||
|
||||
Timeouts employ transaction cancellation, so you should note the issues raised by |cancel-func| when using timeouts.
|
||||
|
||||
.. |option-set-timeout-blurb3| replace::
|
||||
|
||||
Like all transaction options, a timeout must be reset after a call to |on-error-func|. This behavior allows the user to make the timeout dynamic. Note that resetting this option resets only the timeout *duration*, not the starting point from which the time is measured.
|
||||
|
||||
.. |option-next-write-no-write-conflict-range-blurb| replace::
|
||||
|
||||
The next write performed on this transaction will not generate a write conflict range. As a result, other transactions which read the key(s) being modified by the next write will not necessarily conflict with this transaction.
|
||||
|
||||
.. |option-next-write-no-write-conflict-range-note| replace::
|
||||
|
||||
Care needs to be taken when using this option on a transaction that is shared between multiple threads. When setting this option, write conflict ranges will be disabled on the next write operation, regardless of what thread it is on.
|
||||
|
||||
.. |future-blurb1| replace::
|
||||
Many FoundationDB API functions return "future" objects. A brief overview of futures is included in the :doc:`class scheduling tutorial <class-scheduling>`. Most future objects behave just like a normal object, but block when you use them for the first time if the asynchronous function which returned the future has not yet completed its action. A future object is considered ready when either a value is available, or when an error has occurred.
|
||||
|
||||
.. |future-cancel-blurb| replace::
|
||||
|
||||
Cancels |future-type-string| and its associated asynchronous operation. If called before the future is ready, attempts to access its value will |error-raise-type| an :ref:`operation_cancelled <developer-guide-error-codes>` |error-type|. Cancelling a future which is already ready has no effect. Note that even if a future is not ready, its associated asynchronous operation may have successfully completed and be unable to be cancelled.
|
||||
|
||||
.. |fdb-open-blurb| replace::
|
||||
Initializes the FoundationDB API, connects to the cluster specified by the :ref:`cluster file <foundationdb-cluster-file>`, and opens the database with the specified name. This function is often called without any parameters, using only the defaults. If no cluster file is passed, FoundationDB automatically :ref:`determines a cluster file <specifying-a-cluster-file>` with which to connect to a cluster.
|
||||
|
||||
.. |fdb-transactional-unknown-result-note| replace::
|
||||
In some failure scenarios, it is possible that your transaction will be executed twice. See :ref:`developer-guide-unknown-results` for more information.
|
||||
|
||||
.. |db-attribute-blurb| replace::
|
||||
The |database-class| that this transaction is interacting with.
|
||||
|
||||
.. |database-get-key-caching-blurb| replace::
|
||||
The key is cached, providing a potential performance benefit. However, the
|
||||
value of the key is also retrieved, using network bandwidth.
|
||||
|
||||
.. |transaction-get-key-caching-blurb| replace::
|
||||
By default, the key is cached for the duration of the transaction, providing
|
||||
a potential performance benefit. However, the value of the key is also retrieved,
|
||||
using network bandwidth. Invoking |read-your-writes-disable-option| will avoid
|
||||
both the caching and the increased network bandwidth.
|
||||
|
||||
.. |network-cannot-be-restarted-blurb| replace::
|
||||
Once the network is stopped it cannot be restarted during the lifetime of the running program.
|
||||
|
||||
.. |fdb-careful-with-callbacks-blurb| replace::
|
||||
There are a number of requirements and constraints to be aware of when using callbacks with FoundationDB. Please read :ref:`developer-guide-programming-with-futures`.
|
||||
|
||||
.. |subspace-blurb1| replace::
|
||||
Subspaces provide a convenient way to use the |tuple-layer| to define namespaces for different categories of data. The namespace is specified by a prefix tuple which is prepended to all tuples packed by the subspace. When unpacking a key with the subspace, the prefix tuple will be removed from the result.
|
||||
|
||||
.. |subspace-blurb2| replace::
|
||||
As a best practice, API clients should use at least one subspace for application data.
|
||||
|
||||
.. |subspace-blurb3| replace::
|
||||
Creates a subspace with the specified prefix tuple. If the raw prefix byte string is specified, then it will be prepended to all packed keys. Likewise, the raw prefix will be removed from all unpacked keys.
|
||||
|
||||
.. |subspace-key-blurb| replace::
|
||||
Returns the key encoding the prefix used for the subspace. This is equivalent to packing the empty tuple.
|
||||
|
||||
.. |subspace-pack-blurb| replace::
|
||||
Returns the key encoding the specified tuple in the subspace. For example, if you have a subspace with prefix tuple ``('users')`` and you use it to pack the tuple ``('Smith')``, the result is the same as if you packed the tuple ``('users', 'Smith')`` with the |tuple-layer|.
|
||||
|
||||
.. |subspace-unpack-blurb| replace::
|
||||
Returns the tuple encoded by the given key, with the subspace's prefix tuple and raw prefix removed.
|
||||
|
||||
.. |subspace-range-blurb| replace::
|
||||
Returns a range representing all keys in the subspace that encode tuples strictly starting with the specified tuple.
|
||||
|
||||
.. |subspace-contains-blurb| replace::
|
||||
Returns true if ``key`` starts with |key-meth|, indicating that the subspace logically contains ``key``.
|
||||
|
||||
.. |subspace-as-foundationdb-key-blurb| replace::
|
||||
Returns the key encoding the prefix used for the subspace, like |key-meth|.
|
||||
|
||||
.. |subspace-subspace-blurb| replace::
|
||||
Returns a new subspace which is equivalent to this subspace with its prefix tuple extended by the specified tuple.
|
||||
|
||||
.. |directory-blurb1| replace::
|
||||
The FoundationDB API provides :ref:`directories <developer-guide-directories>` as a tool for managing related |subspace-api|. Directories are a recommended approach for administering applications. Each application should create or open at least one directory to manage its subspaces.
|
||||
|
||||
.. |directory-blurb2| replace::
|
||||
Directories are identified by hierarchical paths analogous to the paths in a Unix-like file system. A path is represented as |dir-path-type| of strings. Each directory has an associated subspace used to store its content. The directory layer maps each path to a short prefix used for the corresponding subspace. In effect, directories provide a level of indirection for access to subspaces.
|
||||
|
||||
.. |directory-blurb3| replace::
|
||||
Except where noted, directory methods interpret the provided path(s) relative to the path of the directory object. When opening a directory, a byte string ``layer`` option may be specified as a metadata identifier.
|
||||
|
||||
.. |directory-layer-blurb| replace::
|
||||
Each instance defines a new root directory. The subspaces |node-subspace| and |content-subspace| control where the directory metadata and contents, respectively, are stored. The default root directory has a |node-subspace| with raw prefix ``\xFE`` and a |content-subspace| with no prefix. Specifying more restrictive values for |node-subspace| and |content-subspace| will allow using the directory layer alongside other content in a database. If |allow-manual-prefixes| is false, attempts to create a directory with a manual prefix under the directory layer will |error-raise-type| an |error-type|. The default root directory does not allow manual prefixes.
|
||||
|
||||
.. |directory-create-or-open-blurb| replace::
|
||||
Opens the directory with ``path`` specified as |dir-path-type| of strings. ``path`` can also be a string, in which case it will be automatically wrapped in |dir-path-type|. All string values in a path will be converted to unicode. If the directory does not exist, it is created (creating parent directories if necessary).
|
||||
|
||||
.. **NOTE** this blurb is not used in api-node
|
||||
.. |directory-create-or-open-return-blurb| replace::
|
||||
Returns the directory and its contents as a |directory-subspace|.
|
||||
|
||||
.. |directory-open-blurb| replace::
|
||||
Opens the directory with ``path`` specified as |dir-path-type| of strings. ``path`` can also be a string, in which case it will be automatically wrapped in |dir-path-type|. All string values in a path will be converted to unicode. The method will |error-raise-type| an |error-type| if the directory does not exist.
|
||||
|
||||
.. |directory-create-blurb| replace::
|
||||
Creates a directory with ``path`` specified as |dir-path-type| of strings. ``path`` can also be a string, in which case it will be automatically wrapped in |dir-path-type|. All string values in a path will be converted to unicode. Parent directories are created if necessary. The method will |error-raise-type| an |error-type| if the given directory already exists.
|
||||
|
||||
.. |directory-move-blurb| replace::
|
||||
Moves the directory at ``old_path`` to ``new_path``. There is no effect on the physical prefix of the given directory or on clients that already have the directory open. The method will |error-raise-type| an |error-type| if a directory does not exist at ``old_path``, a directory already exists at ``new_path``, or the parent directory of ``new_path`` does not exist.
|
||||
|
||||
.. **NOTE** this blurb is not used in api-node
|
||||
.. |directory-move-return-blurb| replace::
|
||||
Returns the directory at its new location as a |directory-subspace|.
|
||||
|
||||
.. |directory-remove-blurb| replace::
|
||||
Removes the directory at ``path``, its contents, and all subdirectories. The method will |error-raise-type| an |error-type| if the directory does not exist.
|
||||
|
||||
.. |directory-remove-warning| replace::
|
||||
Clients that have already opened the directory might still insert data into its contents after removal.
|
||||
|
||||
.. |directory-remove-if-exists-blurb| replace::
|
||||
Checks if the directory at ``path`` exists and, if so, removes the directory, its contents, and all subdirectories. Returns ``true`` if the directory existed and ``false`` otherwise.
|
||||
|
||||
.. |directory-exists-blurb| replace::
|
||||
Returns ``true`` if the directory at ``path`` exists and ``false`` otherwise.
|
||||
|
||||
.. |directory-get-layer-blurb| replace::
|
||||
Returns the layer specified when the directory was created.
|
||||
|
||||
.. |directory-get-path-blurb| replace::
|
||||
Returns the path with which the directory was opened.
|
||||
|
||||
.. |directory-subspace-blurb| replace::
|
||||
A directory subspace represents a specific directory and its contents. It stores the ``path`` with which it was opened and supports all |directory-layer| methods for operating on itself and its subdirectories. It also implements all |subspace| methods for working with the contents of that directory.
|
||||
|
||||
.. |directory-move-to-blurb| replace::
|
||||
Moves this directory to ``new_path``, interpreting ``new_path`` absolutely. There is no effect on the physical prefix of the given directory or on clients that already have the directory open. The method will |error-raise-type| an |error-type| if a directory already exists at ``new_path`` or the parent directory of ``new_path`` does not exist.
|
||||
|
||||
.. |locality-api-blurb| replace::
|
||||
The FoundationDB API comes with a set of functions for discovering the storage locations of keys within your cluster. This information can be useful for advanced users who wish to take into account the location of keys in the design of applications or processes.
|
||||
|
||||
.. |locality-get-boundary-keys-db-or-tr| replace::
|
||||
The first parameter to this function may be either a |database-class| or a |transaction-class|. If it is passed a |transaction-class|, the transaction will not be committed, reset, or modified in any way, nor will its transaction options (such as retry limit) be applied within the function. However, if the database is unavailable prior to the function call, any timeout set on the transaction will still trigger.
|
||||
|
||||
.. |locality-get-boundary-keys-blurb| replace::
|
||||
|
||||
Returns a |lazy-iterator-object| of keys ``k`` such that ``begin <= k < end`` and ``k`` is located at the start of a contiguous range stored on a single server.
|
||||
|
||||
.. |locality-get-boundary-keys-warning-danger| replace::
|
||||
|
||||
This method is not transactional. It will return an answer no older than the Transaction or Database object it is passed, but the returned boundaries are an estimate and may not represent the exact boundary locations at any database version.
|
||||
|
||||
.. |locality-get-addresses-for-key-blurb| replace::
|
||||
|
||||
Returns a list of public network addresses as strings, one for each of the storage servers responsible for storing ``key`` and its associated value.
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
.. _developer-guide-error-codes:
|
||||
|
||||
Error Codes
|
||||
===========
|
||||
|
||||
FoundationDB may return the following error codes from API functions. If you need to check for specific errors (for example, to implement custom retry logic), you must use the numerical code, since the other fields are particularly likely to change unexpectedly. Error handling logic should also be prepared for new error codes which are not listed here.
|
||||
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| Name | Code| Description |
|
||||
+===============================================+=====+================================================================================+
|
||||
| success | 0| Success |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| operation_failed | 1000| Operation failed |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| timed_out | 1004| Operation timed out |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| transaction_too_old | 1007| Transaction is too old to perform reads or be committed |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| future_version | 1009| Request for future version |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| not_committed | 1020| Transaction not committed due to conflict with another transaction |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| commit_unknown_result | 1021| Transaction may or may not have committed |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| transaction_cancelled | 1025| Operation aborted because the transaction was cancelled |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| transaction_timed_out | 1031| Operation aborted because the transaction timed out |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| too_many_watches | 1032| Too many watches currently set |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| watches_disabled | 1034| Watches cannot be set if read your writes is disabled |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| accessed_unreadable | 1036| Read or wrote an unreadable key |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| database_locked | 1038| Database is locked |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| cluster_version_changed | 1039| Cluster has been upgraded to a new protocol version |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| external_client_already_loaded | 1040| External client has already been loaded |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| operation_cancelled | 1101| Asynchronous operation cancelled |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| future_released | 1102| Future has been released |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| platform_error | 1500| Platform error |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| large_alloc_failed | 1501| Large block allocation failed |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| performance_counter_error | 1502| QueryPerformanceCounter error |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| io_error | 1510| Disk i/o operation failed |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| file_not_found | 1511| File not found |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| bind_failed | 1512| Unable to bind to network |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| file_not_readable | 1513| File could not be read |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| file_not_writable | 1514| File could not be written |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| no_cluster_file_found | 1515| No cluster file found in current directory or default location |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| file_too_large | 1516| File too large to be read |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| client_invalid_operation | 2000| Invalid API call |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| commit_read_incomplete | 2002| Commit with incomplete read |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| test_specification_invalid | 2003| Invalid test specification |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| key_outside_legal_range | 2004| Key outside legal range |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| inverted_range | 2005| Range begin key larger than end key |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| invalid_option_value | 2006| Option set with an invalid value |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| invalid_option | 2007| Option not valid in this context |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| network_not_setup | 2008| Action not possible before the network is configured |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| network_already_setup | 2009| Network can be configured only once |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| read_version_already_set | 2010| Transaction already has a read version set |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| version_invalid | 2011| Version not valid |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| range_limits_invalid | 2012| Range limits not valid |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| invalid_database_name | 2013| Database name must be 'DB' |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| attribute_not_found | 2014| Attribute not found in string |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| future_not_set | 2015| Future not ready |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| future_not_error | 2016| Future not an error |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| used_during_commit | 2017| Operation issued while a commit was outstanding |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| invalid_mutation_type | 2018| Unrecognized atomic mutation type |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| transaction_invalid_version | 2020| Transaction does not have a valid commit version |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| no_commit_version                             | 2021| Transaction is read-only and therefore does not have a commit version         |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| environment_variable_network_option_failed | 2022| Environment variable network option could not be set |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| transaction_read_only | 2023| Attempted to commit a transaction specified as read-only |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| incompatible_protocol_version | 2100| Incompatible protocol version |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| transaction_too_large | 2101| Transaction exceeds byte limit |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| key_too_large | 2102| Key length exceeds limit |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| value_too_large | 2103| Value length exceeds limit |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| connection_string_invalid | 2104| Connection string invalid |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| address_in_use | 2105| Local address in use |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| invalid_local_address | 2106| Invalid local address |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| tls_error | 2107| TLS error |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| unsupported_operation | 2108| Operation is not supported |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| api_version_unset | 2200| API version is not set |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| api_version_already_set | 2201| API version may be set only once |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| api_version_invalid | 2202| API version not valid |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| api_version_not_supported | 2203| API version not supported |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| exact_mode_without_limits | 2210| EXACT streaming mode requires limits, but none were given |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| unknown_error | 4000| An unknown error occurred |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
||||
| internal_error | 4100| An internal error occurred |
|
||||
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
|
|
@ -0,0 +1,106 @@
|
|||
##########################
|
||||
Using FoundationDB Clients
|
||||
##########################
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
.. _versioning:
|
||||
|
||||
Versioning
|
||||
==========
|
||||
|
||||
FoundationDB supports a robust versioning system for both its API and binaries. This system allows clusters to be upgraded with minimal changes to both application code and FoundationDB binaries. The API and the FoundationDB binaries are each released in numbered versions. Each version of the binaries has a corresponding API version.
|
||||
|
||||
API versions
|
||||
------------
|
||||
|
||||
In general, a given client will support both its corresponding API version *and earlier API versions*. A client selects the API version it will use by explicitly specifying the version number upon initialization. The purpose of this design is to allow the server, client libraries, or bindings to be upgraded without having to modify application code. You can therefore upgrade to more recent packages (and so receive their various improvements) without having to change application code.
|
||||
|
||||
Binary versions
|
||||
---------------
|
||||
|
||||
By default, client and server binaries installed for a given cluster must have the same version. However, there is also a multi-version feature that allows a client to connect to a cluster whose server processes have a different version. This feature works by loading a separate client library that is version-compatible with the cluster, which then proxies all API calls.
|
||||
|
||||
.. note:: The interaction between API versions and binary versions follows a simple rule: *the API version cannot be set higher than that supported by the client library in use*, including any client library loaded using the multi-version feature.
|
||||
|
||||
For more details, see :ref:`api-python-versioning` and :ref:`multi-version-client-api`.
|
||||
|
||||
Cluster file
|
||||
============
|
||||
|
||||
FoundationDB servers and clients use a cluster file (typically named ``fdb.cluster``) to connect to a cluster. The contents of the cluster file are the same for all processes that connect to the cluster. When connecting to a cluster, the :doc:`client APIs <api-reference>` allow a cluster file to either be explicitly provided or automatically determined by default. For example, using the Python API::
|
||||
|
||||
db = fdb.open('/path/to/specific/file')
|
||||
|
||||
uses only the specified file and errors if it is invalid. In contrast::
|
||||
|
||||
db = fdb.open()
|
||||
|
||||
checks the ``FDB_CLUSTER_FILE`` environment variable, then the current working directory, then the :ref:`default file <default-cluster-file>`. FoundationDB's procedure for determining a cluster file is described in :ref:`Specifying a cluster file <specifying-a-cluster-file>`.
|
||||
|
||||
.. _installing-client-binaries:
|
||||
|
||||
Installing FoundationDB client binaries
|
||||
=======================================
|
||||
|
||||
To access a FoundationDB cluster from a machine that won't need to run the server, you can install just the FoundationDB client binaries (available for download at :doc:`downloads`).
|
||||
|
||||
.. warning:: |upgrade-client-server-warning|
|
||||
|
||||
If you don't already have a FoundationDB cluster to connect to, you should instead follow the instructions in :doc:`getting-started-mac` or :doc:`getting-started-linux`.
|
||||
|
||||
To install on **Ubuntu** use the dpkg command:
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
user@host$ sudo dpkg -i |package-deb-clients|
|
||||
|
||||
To install on **RHEL/CentOS** use the rpm command:
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
user@host$ sudo rpm -Uvh |package-rpm-clients|
|
||||
|
||||
To install on **macOS**, run the installer as in :doc:`getting-started-mac`, but deselect the "FoundationDB Server" feature.
|
||||
|
||||
The client binaries include the ``fdbcli`` tool and language bindings for C and Python. Other language bindings must be installed separately.
|
||||
|
||||
Clients will also need a :ref:`cluster file <foundationdb-cluster-file>` to connect to a FoundationDB cluster. You should copy the ``fdb.cluster`` file from the :ref:`default location <default-cluster-file>` on one of your FoundationDB servers to the default location on the client machine.
|
||||
|
||||
.. _multi-version-client-api:
|
||||
|
||||
Multi-version client
|
||||
====================
|
||||
|
||||
The FoundationDB client library supports connecting to clusters with server processes running at a different version than the client. To do so, it must have access to a version of the client compatible with the cluster, which it loads and then proxies API calls through.
|
||||
|
||||
To make use of the multi-version client capabilities, you must set at least one of two network options (``EXTERNAL_CLIENT_LIBRARY`` and/or ``EXTERNAL_CLIENT_DIRECTORY``) that specify the location of these additional client libraries. The client library will start a new network thread for each external client and attempt to make connections to the cluster over all of them (as well as the local library). When making API calls, the library will use the version that is able to communicate with the cluster (choosing an arbitrary one if there are multiple compatible clients), and it will also automatically switch to the appropriate client should the cluster's protocol version change.
|
||||
|
||||
The multi-version client API adds a new ``cluster_version_changed`` error that is used to cancel operations performed by the client when it switches to using a different version of the client library. This error is a retryable one, meaning that all standard retry loops will automatically rerun a transaction that fails with ``cluster_version_changed``.
|
||||
|
||||
.. warning:: Setting an API version that is not supported by a particular client library will prevent that client from being used to connect to the cluster. In particular, you should not advance the API version of your application after upgrading your client until the cluster has also been upgraded.
|
||||
|
||||
.. warning:: You should avoid including multiple protocol-compatible clients in the external client libraries list. While the client will still work, it will consume more resources than necessary. Additionally, different patch releases of the same version (e.g. ``x.y.z`` and ``x.y.w``) are generally protocol compatible, and including multiple may result in not using the most recent compatible client.
|
||||
|
||||
.. note:: It is recommended that you not include more external clients than necessary. For example, a client that has been upgraded to a newer version than its cluster may need to include a single external client that matches the version of the cluster, but it generally won't require a copy of every prior version.
|
||||
|
||||
.. note:: If ``cluster_version_changed`` is thrown during commit, it should be interpreted similarly to ``commit_unknown_result``. The commit may or may not have been completed.
|
||||
|
||||
.. _network-options-using-environment-variables:
|
||||
|
||||
Setting network options with environment variables
|
||||
==================================================
|
||||
|
||||
Client network options can be set automatically upon client startup using specially crafted environment variables. To set a particular network option, add a variable of the following form to your environment::
|
||||
|
||||
FDB_NETWORK_OPTION_<UPPERCASE_OPTION_NAME> = value
|
||||
|
||||
For example, you can enable trace logging for a client by setting the following environment variable::
|
||||
|
||||
FDB_NETWORK_OPTION_TRACE_ENABLE = ""
|
||||
|
||||
If you want to set the same option multiple times (e.g. to add multiple client libraries, for instance), you can separate the values with the default path separator on your system (``:`` on Linux/macOS, ``;`` on Windows). For example::
|
||||
|
||||
FDB_NETWORK_OPTION_EXTERNAL_CLIENT_DIRECTORY = /path/to/dir1:/path/to/dir2:...
|
||||
|
||||
Network options specified using environment variables are set at the end of the call to set the API version. They will be applied in the order of the corresponding option codes assigned to the options. If there is an error reading the appropriate environment variables or if the option cannot be set with the specified value, then the call to set the API version may return an error. In that case, you may attempt to set the API version again, but you must do so with the same version as the attempt that failed.
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,19 @@
|
|||
.. FoundationDB documentation master file, created by
|
||||
sphinx-quickstart on Tue Oct 16 12:48:09 2012.
|
||||
|
||||
API Reference
|
||||
=============
|
||||
|
||||
The following documents give detailed descriptions of the API for each language:
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:titlesonly:
|
||||
|
||||
api-python
|
||||
api-ruby
|
||||
Node.js API <api-node>
|
||||
Java API <relative://javadoc/index.html>
|
||||
Go API <relative://godoc/fdb.html>
|
||||
api-c
|
||||
api-error-codes
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,9 @@
|
|||
############
|
||||
Architecture
|
||||
############
|
||||
|
||||
FoundationDB makes your architecture flexible and easy to operate. Your applications can send their data directly to FoundationDB or to a :doc:`layer<layer-concept>`, a user-written module that can provide a new data model, compatibility with existing systems, or even serve as an entire framework. In both cases, all data is stored in a single place via an ordered, transactional key-value API.
|
||||
|
||||
The following diagram details the logical architecture.
|
||||
|
||||
.. image:: /images/Architecture.pdf
|
|
@ -0,0 +1,482 @@
|
|||
.. _backups:
|
||||
|
||||
######################
|
||||
Backup and Restoration
|
||||
######################
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
This document covers backup and restoration of a FoundationDB database. While FoundationDB itself is fault tolerant, the backup tool provides an additional level of protection by supporting recovery from disasters or unintentional modification of the database.
|
||||
|
||||
.. _backup-introduction:
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
FoundationDB's backup tool makes a consistent, point-in-time backup of a FoundationDB database without downtime. Like FoundationDB itself, the backup/restore software is distributed, with multiple backup agents cooperating to perform a backup or restore faster than a single machine can send or receive data and to continue the backup process seamlessly even when some backup agents fail.
|
||||
|
||||
Since the FoundationDB database usually cannot maintain a consistent snapshot long enough to perform a full backup, the backup tool stores an *inconsistent* copy of the data with a log of database changes that took place during the backup process. The copy and the logs are combined at restore time to reconstruct a consistent, point-in-time snapshot of the database.
|
||||
|
||||
FoundationDB can write backup data to local disks, to a blob store instance, or to another FoundationDB cluster. The location to write a backup to or restore a backup from is called a Backup URL.
|
||||
|
||||
There are 6 command line tools for working with backups:
|
||||
|
||||
``fdbbackup``
|
||||
This command line tool is used to control the backup process. It can ``start`` or ``abort`` a backup, ``discontinue`` a continuous backup, get the ``status`` of an ongoing backup, or ``wait`` for a backup to complete. It controls ``backup_agent`` processes using the database and never does any work itself.
|
||||
|
||||
``fdbrestore``
|
||||
This command line tool is used to control the restore process. It can ``start`` or ``abort`` a restore, get the ``status`` of current and recent restore tasks, or ``wait`` for a restore task to complete while printing ongoing progress details.
|
||||
|
||||
``backup_agent``
|
||||
The backup agent does the actual work of the backup: reading data (and logs) from the database and writing it to the Backup URL. Any number of backup agents pointed at the same database will cooperate to perform the backup. The Backup URL specified for a backup must be accessible by all ``backup_agent`` processes.
|
||||
|
||||
``fdbblob``
|
||||
This tool allows listing, deleting, and getting information on backups that are located in a blob store instance (which have Backup URLs starting with blobstore://).
|
||||
|
||||
``fdbdr``
|
||||
This command line tool is used to control the process of backing up to another FoundationDB cluster. It can ``start``, ``switch``, or ``abort`` a backup. It can also get the ``status`` of an ongoing backup. It controls ``db_agent`` processes using the database and never does any work itself.
|
||||
|
||||
``db_agent``
|
||||
The database backup agent does the actual work of the backup: reading data (and logs) from the source database and writing it to the destination database. Any number of agents pointed at the same database will cooperate to perform the backup.
|
||||
|
||||
By default, the FoundationDB packages are configured to start a single ``backup_agent`` process on each FoundationDB server. If you want to perform a backup to a network drive or blob store instance that is accessible to every server, you can immediately use the ``fdbbackup start`` command from any machine with access to your cluster to start the backup::
|
||||
|
||||
user@host$ fdbbackup start -d <Backup_URL>
|
||||
|
||||
If instead you want to perform a backup to the local disk of a particular machine or machines which are not network accessible to the FoundationDB servers, then you should disable the backup agents on the FoundationDB servers. This is accomplished by commenting out all of the ``[backup_agent.<ID>]`` sections in :ref:`foundationdb.conf <foundationdb-conf>`. Do not comment out the global ``[backup_agent]`` section. Next, start backup agents on the destination machine or machines. Now, when you start a backup, you can specify the destination directory (as a Backup URL) using a local path on the destination machines. The backup agents will fetch data from the database and store it locally on the destination machines.
|
||||
|
||||
Backup URLs
|
||||
===========
|
||||
|
||||
Backup and Restore locations are specified by Backup URLs. Currently there are two valid Backup URL formats.
|
||||
|
||||
Note that items in angle brackets (< and >) are just placeholders and must be replaced (including the brackets) with meaningful values.
|
||||
|
||||
For local directories, the Backup URL format is::
|
||||
|
||||
file://</absolute/path/to/base_dir>
|
||||
|
||||
An example would be ``file:///home/backups`` which would refer to the directory ``/home/backups``.
|
||||
Note that since paths must be absolute this will result in three slashes (/) in a row in the URL.
|
||||
|
||||
Note that for local directory URLs the actual backup files will not be written to <base_dir> directly but rather to a uniquely timestamped subdirectory. When starting a restore the path to the timestamped subdirectory must be specified.
|
||||
|
||||
For blob store backup locations, the Backup URL format is::
|
||||
|
||||
blobstore://<api_key>:<secret>@<ip>:<port>/<name>[?<param>=<value>[&<param>=<value>]...]
|
||||
|
||||
An example blob store Backup URL would be ``blobstore://myKey:mySecret@2.2.2.2:80/dec_1_2015_0400``.
|
||||
|
||||
Blob store Backup URLs can have optional parameters which set various limits on interactions with the blob store. All values must be positive decimal integers.
|
||||
|
||||
Under normal circumstances the default values should be sufficient. The parameter most likely to need changing is the maximum speed at which to send data to the blob store, which is called ``max_send_bytes_per_second`` (or ``sbps`` for short).
|
||||
|
||||
Here is a complete list of valid parameters:
|
||||
|
||||
*connect_tries* (or *ct*) - Number of times to try to connect for each request.
|
||||
|
||||
*request_tries* (or *rt*) - Number of times to try each request until a parseable HTTP response other than 429 is received.
|
||||
|
||||
*requests_per_second* (or *rps*) - Max number of requests to start per second.
|
||||
|
||||
*concurrent_requests* (or *cr*) - Max number of requests in progress at once.
|
||||
|
||||
*multipart_max_part_size* (or *maxps*) - Max part size for multipart uploads.
|
||||
|
||||
*multipart_min_part_size* (or *minps*) - Min part size for multipart uploads.
|
||||
|
||||
*concurrent_uploads* (or *cu*) - Max concurrent uploads (part or whole) that can be in progress at once.
|
||||
|
||||
*concurrent_reads_per_file* (or *crps*) - Max concurrent reads in progress for any one file.
|
||||
|
||||
*read_block_size* (or *rbs*) - Block size in bytes to be used for reads.
|
||||
|
||||
*read_ahead_blocks* (or *rab*) - Number of blocks to read ahead of requested offset.
|
||||
|
||||
*read_cache_blocks_per_file* (or *rcb*) - Size of the read cache for a file in blocks.
|
||||
|
||||
*max_send_bytes_per_second* (or *sbps*) - Max send bytes per second for all requests combined.
|
||||
|
||||
*max_recv_bytes_per_second* (or *rbps*) - Max receive bytes per second for all requests combined.
|
||||
|
||||
|
||||
``fdbbackup`` command line tool
|
||||
===============================
|
||||
|
||||
.. program:: fdbbackup
|
||||
|
||||
The ``fdbbackup`` command line tool is used to control the backup process. ::
|
||||
|
||||
user@host$ fdbbackup [-h] [-v] [-C <CLUSTER_FILE>] <SUBCOMMAND> <SUBCOMMAND_OPTIONS>
|
||||
|
||||
The following optional arguments must be specified *before* a subcommand:
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbbackup`` command.
|
||||
|
||||
.. option:: -v
|
||||
|
||||
Get the version of FoundationDB in use.
|
||||
|
||||
.. option:: -C <CLUSTER_FILE>
|
||||
|
||||
Specify the path to the ``fdb.cluster`` file that should be used to connect to the FoundationDB cluster you want to back up.
|
||||
|
||||
If not specified, a :ref:`default cluster file <default-cluster-file>` will be used.
|
||||
|
||||
.. _backup-start:
|
||||
|
||||
.. program:: fdbbackup start
|
||||
|
||||
``start``
|
||||
---------
|
||||
|
||||
The ``start`` subcommand is used to start a backup. If there is already a backup in progress, the command will fail and the current backup will be unaffected. Otherwise, a backup is started. If the wait option is used, the command will wait for the backup to complete; otherwise, it returns immediately.
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbbackup start`` subcommand.
|
||||
|
||||
.. option:: -d <Backup_URL>
|
||||
|
||||
| Specify the Backup URL for backup data to be written to. The destination must exist and be writable by every active ``backup_agent`` process connected to the FoundationDB cluster, but need not be accessible to the ``fdbbackup`` tool itself. (By default, the ``backup_agent`` process is running as the ``foundationdb`` user and group on each FoundationDB server.)
|
||||
|
|
||||
| When using file:// Backup URLs, note that the backup files are not written directly to specified directory but rather in a subdirectory of this directory unique for each backup. For example, if the destination URL is ``file:///shared/database-backups/``, the backup agents will create a subdirectory like ``/shared/database-backups/backup-2012-11-01-08-50-22.285173`` in which the actual backup files will be written. Once the backup has completed successfully, all the files in the backup subdirectory together make up the backup. For restoring a backup to succeed, all of these files must be present. The last file written will be ``/shared/database-backups/backup-2012-11-01-08-50-22.285173/restorable``.
|
||||
|
||||
.. option:: -w
|
||||
|
||||
Wait for the backup to complete with behavior identical to that of the :ref:`wait command <backup-wait>`.
|
||||
|
||||
.. option:: -k <KEY_RANGE>+
|
||||
|
||||
| Specify one or more key ranges to be used for the backup. The key ranges are specified on a single line, separated by spaces. Each key range consists of a single quoted string of the form ``[<BEGIN_KEY> [<END_KEY>]]``. If ``<END_KEY>`` is given, the range extends from ``<BEGIN_KEY>`` (inclusive) to ``<END_KEY>`` (exclusive). If ``<END_KEY>`` is omitted, the range consists of all keys with ``<BEGIN_KEY>`` as a proper prefix.
|
||||
|
|
||||
| Each key range should be quoted in a manner appropriate for your command line environment. Here are some examples for Bash:
|
||||
| ``fdbbackup start -k 'apple bananna' -k 'mango pineapple' <Backup_URL>``
|
||||
| ``fdbbackup start -k '@pp1e b*n*nn*' -k '#an&0 p^n3app!e' <Backup_URL>``
|
||||
|
|
||||
| Here are the equivalent examples for Windows:
|
||||
| ``fdbbackup.exe start -k "apple bananna" -k "mango pineapple" <Backup_URL>``
|
||||
| ``fdbbackup.exe start -k "@pp1e b*n*nn*" -k "#an&0 p^n3app!e" <Backup_URL>``
|
||||
|
||||
.. option:: -z
|
||||
|
||||
Perform the backup continuously rather than terminating when a full backup is complete. All subsequent writes (in the backup's key ranges) will be included in the backup.
|
||||
|
||||
.. warning:: Restoring a backup will take time proportional to the duration of the backup. Therefore, leaving a backup running with -z for an extended period is not recommended. The continuous backup feature should only be used if you periodically discontinue and restart the backup.
|
||||
|
||||
.. program:: fdbbackup abort
|
||||
|
||||
``abort``
|
||||
---------
|
||||
|
||||
The ``abort`` subcommand is used to abort a backup that is currently in progress. If there is no backup in progress, the command will return an error. The destination backup is NOT deleted automatically, but it cannot be used to perform a restore.
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbbackup abort`` subcommand.
|
||||
|
||||
.. warning:: The ``abort`` command will render any running backup unrestorable. To stop a continuous backup gracefully, use ``discontinue``.
|
||||
|
||||
.. program:: fdbbackup discontinue
|
||||
|
||||
``discontinue``
|
||||
---------------
|
||||
|
||||
The ``discontinue`` subcommand is only available for backups that were started with the continuous (``-z``) option. Its effect is to discontinue the continuous backup. Note that the subcommand does *not* abort the backup; it simply allows the backup to complete as a noncontinuous backup would.
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbbackup discontinue`` subcommand.
|
||||
|
||||
.. option:: -w
|
||||
|
||||
Wait for the backup to complete with behavior identical to that of the :ref:`wait command <backup-wait>`.
|
||||
|
||||
|
||||
.. _backup-wait:
|
||||
|
||||
.. program:: fdbbackup wait
|
||||
|
||||
``wait``
|
||||
--------
|
||||
|
||||
The ``wait`` subcommand is used to wait for a backup to complete, which is useful for scripting purposes. If there is a backup in progress, it waits for it to complete or be aborted and returns a status based on the result of the backup. If there is no backup in progress, it returns immediately based on the result of the previous backup. The exit code is zero (success) if the backup was completed successfully and nonzero if it was aborted.
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbbackup wait`` subcommand.
|
||||
|
||||
.. program:: fdbbackup status
|
||||
|
||||
``status``
|
||||
----------
|
||||
|
||||
The ``status`` subcommand is used to get information on the current status of a backup. It tells whether there is a backup in progress and backup agents are running. It will also report any errors that have been encountered by backup agents (e.g., errors writing to the output directory).
|
||||
|
||||
::
|
||||
|
||||
user@host$ fdbbackup status -e 1
|
||||
Backup in progress to file:///share/backup_test/out/backup-2012-11-01-10-30-59.270596/.
|
||||
WARNING: Some backup agents have reported errors (printing 1):
|
||||
rebar06(17403) - Error opening /share/backup_test/out/backup-2012-11-01-10-30-59.270596/temp.aff16af7e28046698bc847dc36f3f0f4.part. (IOError: [Errno 2] No such file or directory: '/share/backup_test/out/backup-2012-11-01-10-30-59.270596/temp.aff16af7e28046698bc847dc36f3f0f4.part')
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbbackup status`` subcommand.
|
||||
|
||||
.. option:: -e <LIMIT>
|
||||
|
||||
Print the last (up to) ``<LIMIT>`` errors that were logged into the database by backup agents. The default is 10.
|
||||
|
||||
``fdbrestore`` command line tool
|
||||
================================
|
||||
|
||||
.. program:: fdbrestore
|
||||
|
||||
The ``fdbrestore`` command line tool is used to control restore tasks.
|
||||
|
||||
.. warning:: Restoring a backup will clear the contents of your database within the specified key range to restore, so use this tool with caution!
|
||||
|
||||
.. warning:: It is your responsibility to ensure that no clients are accessing the database while it is being restored. During the restore process the database is in an inconsistent state, and writes that happen during the restore process might be partially or completely overwritten by restored data.
|
||||
|
||||
::
|
||||
|
||||
user@host$ fdbrestore (start | abort | wait | status) [OPTIONS]
|
||||
|
||||
The following options apply to all commands:
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbrestore`` command.
|
||||
|
||||
.. option:: -t <TAG>
|
||||
|
||||
Specify the tag for the restore task. Multiple restore tasks can be in progress at once so long as each task uses a different tag. The default tag is "default".
|
||||
|
||||
.. warning:: If multiple restore tasks are in progress they should be restoring to different prefixes or the result is undefined.
|
||||
|
||||
.. option:: -C <CLUSTER_FILE>
|
||||
|
||||
Specify the path to the ``fdb.cluster`` file that should be used to connect to the FoundationDB cluster you want to use.
|
||||
|
||||
If not specified, a :ref:`default cluster file <default-cluster-file>` will be used.
|
||||
|
||||
.. _restore-start:
|
||||
|
||||
``start``
|
||||
---------
|
||||
|
||||
The ``start`` command will start a new restore on the specified (or default) tag. The command will fail if a tag is already in use by an active restore.
|
||||
|
||||
.. option:: -r <Backup_URL>
|
||||
|
||||
| Specify the Backup URL for the source backup data to restore to the database. The source data must be accessible by the ``backup_agent`` processes for the cluster.
|
||||
|
||||
.. option:: -w
|
||||
|
||||
| Wait for the restore to reach a final state (such as complete) before exiting. Prints a progress update every few seconds. Behavior is identical to that of the wait command.
|
||||
|
||||
.. option:: -k <KEYS>
|
||||
|
||||
| Specify list of key ranges from the backup to restore to the database
|
||||
|
||||
.. option:: --remove_prefix <PREFIX>
|
||||
|
||||
| remove PREFIX from the keys read from the backup
|
||||
|
||||
.. option:: --add_prefix <PREFIX>
|
||||
|
||||
| prefix to add to restored keys before writing them to the database
|
||||
|
||||
.. option:: -n
|
||||
|
||||
| Perform a trial run without actually restoring any data.
|
||||
|
||||
.. option:: -v <VERSION>
|
||||
|
||||
| Instead of the latest version the backup can be restored to, restore to VERSION.
|
||||
|
||||
.. program:: fdbrestore abort
|
||||
|
||||
``abort``
|
||||
---------
|
||||
|
||||
The ``abort`` command will stop an active backup on the specified (or default) tag. It will display the final state of the restore tag.
|
||||
|
||||
``wait``
|
||||
--------
|
||||
|
||||
The ``wait`` command will wait for the restore on the specified (or default) tag to reach a final state (such as complete or abort) and then exit. While waiting it will print a progress update every few seconds.
|
||||
|
||||
.. program:: fdbrestore status
|
||||
|
||||
``status``
|
||||
----------
|
||||
|
||||
The ``status`` command will print a detailed status report for either one tag (if a tag is specified) or for all tags.
|
||||
|
||||
``backup_agent`` command line tool
|
||||
==================================
|
||||
|
||||
.. program:: backup_agent
|
||||
|
||||
``backup_agent`` is started automatically on each server in the default configuration of FoundationDB, so you will not normally need to invoke it at the command line. One case in which you would need to do so would be to perform a backup to a destination which is not accessible via a shared filesystem. ::
|
||||
|
||||
user@host$ backup_agent [-h] [-v] [-C <CLUSTER_FILE>]
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``backup_agent`` command.
|
||||
|
||||
.. option:: -v
|
||||
|
||||
Get the version of FoundationDB in use.
|
||||
|
||||
.. option:: -C <CLUSTER_FILE>
|
||||
|
||||
Specify the path to the ``fdb.cluster`` file that should be used to connect to the FoundationDB cluster you want to back up.
|
||||
|
||||
If not specified, a :ref:`default cluster file <default-cluster-file>` will be used.
|
||||
|
||||
``fdbblob`` command line tool
|
||||
=============================
|
||||
|
||||
The ``fdbblob`` command line tool is used to list, delete, and get info on backups stored in a blob store instance.
|
||||
|
||||
To list backups, run::
|
||||
|
||||
fdbblob list <Blobstore_URL>
|
||||
|
||||
where <Blobstore_URL> is simply a blobstore:// Backup URL without a backup name specified. For example, ``blobstore://mykey:mysecret@2.2.2.2:80/``. The output will be a list of blobstore:// Backup URLs which can then be used for ``delete`` or ``info`` operations.
|
||||
|
||||
To delete a backup, run::
|
||||
|
||||
fdbblob delete <Backup_URL>
|
||||
|
||||
.. warning:: If you cancel a delete operation while it is in progress then the specified backup will no longer be usable. Repeat the delete command and allow it to complete to finish removing the backup since it is just wasting space.
|
||||
|
||||
To scan a backup in order to get its size and object count, run::
|
||||
|
||||
fdbblob info <Backup_URL>
|
||||
|
||||
.. _fdbdr-intro:
|
||||
|
||||
``fdbdr`` command line tool
|
||||
===========================
|
||||
|
||||
.. program:: fdbdr
|
||||
|
||||
The ``fdbdr`` command line tool is used to control the process of backing up to another database. ::
|
||||
|
||||
user@host$ fdbdr [-h] [-v] [-d <CLUSTER_FILE>] [-s <CLUSTER_FILE>] <SUBCOMMAND> <SUBCOMMAND_OPTION>*
|
||||
|
||||
The following optional arguments must be specified *before* a subcommand:
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbdr`` command.
|
||||
|
||||
.. option:: -v
|
||||
|
||||
Get the version of FoundationDB in use.
|
||||
|
||||
.. option:: -d <CLUSTER_FILE>
|
||||
|
||||
Specify the path to the ``fdb.cluster`` file that should be used to connect to the destination FoundationDB cluster you want to back up into.
|
||||
|
||||
.. option:: -s <CLUSTER_FILE>
|
||||
|
||||
Specify the path to the ``fdb.cluster`` file that should be used to connect to the source FoundationDB cluster you want to back up.
|
||||
|
||||
.. _dr-start:
|
||||
|
||||
.. program:: fdbdr start
|
||||
|
||||
``start``
|
||||
---------
|
||||
|
||||
The ``start`` subcommand is used to start a backup. If there is already a backup in progress, the command will fail and the current backup will be unaffected. Otherwise, a backup is started.
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbdr start`` subcommand.
|
||||
|
||||
.. option:: -k <KEY_RANGE>+
|
||||
|
||||
| Specify one or more key ranges to be used for the backup. The key ranges are specified on a single line, separated by spaces. Each key range consists of a single quoted string of the form ``[<BEGIN_KEY> [<END_KEY>]]``. If ``<END_KEY>`` is given, the range extends from ``<BEGIN_KEY>`` (inclusive) to ``<END_KEY>`` (exclusive). If ``<END_KEY>`` is omitted, the range consists of all keys with ``<BEGIN_KEY>`` as a proper prefix.
|
||||
|
|
||||
| Each key range should be quoted in a manner appropriate for your command line environment. Here are some examples for Bash:
|
||||
| ``fdbdr start -k 'apple bananna' -k 'mango pineapple' <Backup_URL>``
|
||||
| ``fdbdr start -k '@pp1e b*n*nn*' -k '#an&0 p^n3app!e' <Backup_URL>``
|
||||
|
|
||||
| Here are the equivalent examples for Windows:
|
||||
| ``fdbdr.exe start -k "apple bananna" -k "mango pineapple" <Backup_URL>``
|
||||
| ``fdbdr.exe start -k "@pp1e b*n*nn*" -k "#an&0 p^n3app!e" <Backup_URL>``
|
||||
|
||||
.. program:: fdbdr switch
|
||||
|
||||
``switch``
|
||||
----------
|
||||
|
||||
The ``switch`` subcommand is used to switch the source database and the destination database. This means the destination will be unlocked and start streaming data into the source database. This command requires both databases to be available. While the switch command is working, both databases will be locked for a few seconds.
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbdr switch`` subcommand.
|
||||
|
||||
.. program:: fdbdr abort
|
||||
|
||||
``abort``
|
||||
---------
|
||||
|
||||
The ``abort`` subcommand is used to abort a backup that is currently in progress. If there is no backup in progress, the command will return an error. The command will leave the destination database at a consistent snapshot of the source database from sometime in the past.
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbdr abort`` subcommand.
|
||||
|
||||
.. warning:: The ``abort`` command will lose some amount of prior commits.
|
||||
|
||||
|
||||
.. program:: fdbdr status
|
||||
|
||||
``status``
|
||||
----------
|
||||
|
||||
The ``status`` subcommand is used to get information on the current status of a backup. It tells whether there is a backup in progress and backup agents are running. It will also report any errors that have been encountered by backup agents.
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``fdbdr status`` subcommand.
|
||||
|
||||
.. option:: -e <LIMIT>
|
||||
|
||||
Print the last (up to) ``<LIMIT>`` errors that were logged into the database by backup agents. The default is 10.
|
||||
|
||||
``db-agent`` command line tool
|
||||
==============================
|
||||
|
||||
.. program:: db-agent
|
||||
|
||||
Unlike ``backup_agent``, ``db-agent`` is not started automatically. A ``db-agent`` needs the cluster files for both the source database and the destination database, and can only perform a backup in one direction (from source to destination). ::
|
||||
|
||||
user@host$ db-agent [-h] [-v] [-d <CLUSTER_FILE>] [-s <CLUSTER_FILE>]
|
||||
|
||||
.. option:: -h
|
||||
|
||||
Get help on the ``db-agent`` command.
|
||||
|
||||
.. option:: -v
|
||||
|
||||
Get the version of FoundationDB in use.
|
||||
|
||||
.. option:: -d <CLUSTER_FILE>
|
||||
|
||||
Specify the path to the ``fdb.cluster`` file that should be used to connect to the destination FoundationDB cluster you want to back up into.
|
||||
|
||||
.. option:: -s <CLUSTER_FILE>
|
||||
|
||||
Specify the path to the ``fdb.cluster`` file that should be used to connect to the source FoundationDB cluster you want to back up.
|
||||
|
|
@ -0,0 +1,151 @@
|
|||
############
|
||||
Benchmarking
|
||||
############
|
||||
|
||||
The goal of this guide is to help you understand how we approach testing FoundationDB using different client concurrencies and cluster sizes.
|
||||
|
||||
Single-core Write Test #1
|
||||
=========================
|
||||
|
||||
FoundationDB can go :doc:`really fast <performance>`, but first let's take a look at how slow we can make it...
|
||||
|
||||
|
||||
Let's set up a minimal FoundationDB database, using just a **single CPU core**. We'll load **1 million rows** with **16-byte keys** and **random 8-100 byte** values and check out how it performs.
|
||||
|
||||
We'll start with writes::
|
||||
|
||||
Loop for 1 minute:
|
||||
Start a transaction
|
||||
Write a random key to a new random value
|
||||
Commit
|
||||
|
||||
Result: 917 writes/second
|
||||
|
||||
If you do this, you'll find that FoundationDB is *quite slow*.
|
||||
|
||||
Why? The biggest reason is that FoundationDB is *ultra-safe* by default. When FoundationDB commits a transaction, it blocks until the transaction is durably logged and fsync'ed to disk. (In this case we are using a **single SATA SSD**.) FoundationDB is optimized for efficient processing of large, sustained workloads at millisecond latencies, rather than these types of synthetic tests that would benefit from ultra-low write latencies.
|
||||
|
||||
Single-core Write Test #2
|
||||
=========================
|
||||
|
||||
So, can it go faster? Well, Test #1 only runs the FoundationDB process at about 3% CPU on a single core, so we might be able to pull something off. Let's try another strategy::
|
||||
|
||||
Loop for 1 minute:
|
||||
Start a transaction
|
||||
Write 10 random keys to new random values
|
||||
Commit
|
||||
|
||||
Result: 8,650 writes/second
|
||||
|
||||
But FoundationDB CPU utilization is still only at 5%! Looks like we can do even more.
|
||||
|
||||
Single-core Write Test #3
|
||||
=========================
|
||||
|
||||
Here's the final strategy for testing writes against FoundationDB running on a single core::
|
||||
|
||||
Start 100 parallel clients, each doing:
|
||||
Loop for 1 minute:
|
||||
Start a transaction
|
||||
Write 10 random keys to new random values
|
||||
Commit
|
||||
|
||||
Result: 46,000 writes/second
|
||||
|
||||
This is 50x faster than our first simple loop! We have achieved this without sacrificing any safety, and still only using a single core. This shows how efficiently FoundationDB handles real-world concurrent workloads—in this case 100 parallel clients. A lot of this can be attributed to the :doc:`Flow programming language <flow>` that we developed specifically for these use cases.
|
||||
|
||||
Single-core Read Test
|
||||
=====================
|
||||
|
||||
Let's take what we learned and test read speed::
|
||||
|
||||
Start 100 parallel clients, each doing:
|
||||
Loop for 1 minute:
|
||||
Start a transaction
|
||||
Read 10 random keys
|
||||
|
||||
Result: 305,000 reads/second
|
||||
|
||||
This test delivers an impressive result while achieving a 0.6 ms average read latency. Again, this is only using one core and doing transactional reads from a fully durable database (albeit without any concurrent writes). Note that a commit will have no impact for a read-only transaction and so is not necessary here.
|
||||
|
||||
Single-core 90/10 Test
|
||||
======================
|
||||
|
||||
Let's put it all together into a test with 90% reads and 10% writes::
|
||||
|
||||
Start 100 parallel clients, each doing:
|
||||
Loop for 1 minute:
|
||||
Start a transaction
|
||||
Read 9 random keys
|
||||
Write 1 random key
|
||||
Commit
|
||||
|
||||
Result: 107,000 operations/second
|
||||
|
||||
This is all done from the **single core database**. Since FoundationDB has a lock-free design, performance remains high even with 100 concurrent clients doing ACID transactions on the same keys.
|
||||
|
||||
Single-core Range Read Test
|
||||
===========================
|
||||
|
||||
Let's check a final performance figure before cranking up the cluster size. FoundationDB is an ordered datastore so let's see how fast we can read concurrent ranges::
|
||||
|
||||
Start 100 parallel clients, each doing:
|
||||
Loop for 1 minute:
|
||||
Start a transaction
|
||||
Perform a random 1000-key range read
|
||||
|
||||
Result: 3,600,000 keys/second
|
||||
|
||||
FoundationDB supports efficient range reads. Ordering keys in your data model to maximize range reads is obviously an important optimization you can use!
|
||||
|
||||
Now it's time to make our FoundationDB cluster a little bigger.
|
||||
|
||||
12-machine Write Test
|
||||
=====================
|
||||
|
||||
We're going to move to using a **modest 12-machine cluster**. Each machine is pretty basic, with a 4-core processor and a single SATA SSD. We'll put a FoundationDB server process on each core, yielding a 48-core cluster. (You could also build something like this with just a couple modern dual-socket machines.) Finally, let's increase the database size to **100,000,000 key-value pairs** and raise the stakes by throwing **3,200 parallel clients (!)** at the cluster. Oh, and let's make it fault-tolerant by enabling **2x replication**.
|
||||
|
||||
We'll start with random writes::
|
||||
|
||||
Start 3,200 parallel clients, each doing:
|
||||
Loop for 1 minute:
|
||||
Start a transaction
|
||||
Write 10 random keys
|
||||
Commit
|
||||
|
||||
Result: 720,000 writes/second
|
||||
|
||||
12-machine Read Test
|
||||
====================
|
||||
|
||||
Now let's turn to reads::
|
||||
|
||||
Start 3,200 parallel clients, each doing:
|
||||
Loop for 1 minute:
|
||||
Start a transaction
|
||||
Read 10 random keys
|
||||
|
||||
Result: 5,540,000 reads/second
|
||||
|
||||
This is exceptional performance for a random-read workload from a transactional database — about half the speed, on the same hardware, of a dedicated caching layer like memcached. Note that performance is significantly less than a linear extrapolation from the single-core results would suggest.
|
||||
|
||||
12-machine 90/10 Test
|
||||
=====================
|
||||
|
||||
Let's put both together into a test with 90% reads and 10% writes::
|
||||
|
||||
Start 3,200 parallel clients, each doing:
|
||||
Loop for 1 minute:
|
||||
Start a transaction
|
||||
Read 9 random keys
|
||||
Write 1 random key
|
||||
Commit
|
||||
|
||||
Result: 2,390,000 operations/second
|
||||
|
||||
Next steps
|
||||
==========
|
||||
|
||||
So how should you go about benchmarking FoundationDB for your own system?
|
||||
|
||||
Begin with the peak throughput your system needs to handle. From here, use the data on our :doc:`performance page <performance>` as a starting point for your cluster configuration and workload design. From our numbers for per-core throughput, you can derive an initial estimate of the number of cores you'll need. Construct a workload that reflects your pattern of reads and writes, making sure to use a large enough number of operations per transaction and/or clients to achieve high concurrency.
|
|
@ -0,0 +1,79 @@
|
|||
####
|
||||
Blob
|
||||
####
|
||||
|
||||
:doc:`Python <blob>` **Java**
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Store binary large objects (blobs) in the database.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
A blob is too large to be stored as a single key-value pair in FoundationDB. Values of key-value pairs are limited to 100 kB, and you’ll get the best performance by keeping them closer to 10 kB.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
The core of the approach is simple: even though we can't store the blob in a single key-value pair, we can still store it by using multiple key-value pairs. We do this by splitting the blob into chunks and storing the chunks within a single subspace.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
Chunks are stored in order in adjacent key-value pairs. This approach allows the blob to be read with a single range read.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
We create a subspace for a given blob. This subspace will hold the individual chunks of the blob as key-value pairs. In the key, we store a byte-offset into the blob; in the value, we store a range of the bytes starting at that offset. A constant ``CHUNK_SIZE`` establishes a maximum size for chunks of the blob (10 kb is a good starting point).
|
||||
|
||||
A simple transactional function to store a single blob with this strategy would look like:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
static{
|
||||
blob = new Subspace(Tuple.from("B"));
|
||||
}
|
||||
|
||||
public static void writeBlob(TransactionContext tcx, final String data){
|
||||
if(data.length() == 0) return;
|
||||
tcx.run((Transaction tr) -> {
|
||||
int numChunks = (data.length() + CHUNK_SIZE - 1)/CHUNK_SIZE;
|
||||
int chunkSize = (data.length() + numChunks)/numChunks;
|
||||
|
||||
for(int i = 0; i*chunkSize < data.length(); i++){
|
||||
int start = i*chunkSize;
|
||||
int end = ((i+1)*chunkSize <= data.length()) ? ((i+1)*chunkSize) : (data.length());
|
||||
tr.set(blob.subspace(Tuple.from(start)).pack(),
|
||||
Tuple.from(data.substring(start, end)).pack());
|
||||
}
|
||||
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
The blob can then be efficiently read with a single range read:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
public static String readBlob(TransactionContext tcx){
|
||||
return tcx.run((Transaction tr) -> {
|
||||
StringBuilder value = new StringBuilder();
|
||||
for(KeyValue kv : tr.getRange(blob.range())){
|
||||
value.append(Tuple.fromBytes(kv.getValue()).getString(0));
|
||||
}
|
||||
|
||||
return value.toString();
|
||||
});
|
||||
}
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*A sparse representation for random reads and writes*
|
||||
|
||||
The simple model above lets us read and write a blob as a whole, but it can be extended to allow random reads and writes, allowing the blob to be partially accessed or streamed. At the same time, the representation can remain sparse, only consuming space for the data that is actually written.
|
||||
|
||||
For efficiency, we sometimes join chunks after a write to avoid the fragmentation that might result from small writes. Joining chunks is controlled by a constant ``CHUNK_SMALL`` (usually around 200 bytes), giving a lower limit for the sum of the sizes of adjacent chunks.
|
|
@ -0,0 +1,62 @@
|
|||
####
|
||||
Blob
|
||||
####
|
||||
|
||||
**Python** :doc:`Java <blob-java>`
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Store binary large objects (blobs) in the database.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
A blob is too large to be stored as a single key-value pair in FoundationDB. Values of key-value pairs are limited to 100 kB, and you’ll get the best performance by keeping them closer to 10 kB.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
The core of the approach is simple: even though we can't store the blob in a single key-value pair, we can still store it by using *multiple* key-value pairs. We do this by splitting the blob into chunks and storing the chunks within a single subspace.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
Chunks are stored in order in adjacent key-value pairs. This approach allows the blob to be read with a single range read.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
We create a subspace for a given blob. This subspace will hold the individual chunks of the blob as key-value pairs. In the key, we store a byte-offset into the blob; in the value, we store a range of the bytes starting at that offset. A constant ``CHUNK_SIZE`` establishes a maximum size for chunks of the blob (10 kb is a good starting point).
|
||||
|
||||
A simple transactional function to store a single blob with this strategy would look like::
|
||||
|
||||
CHUNK_SIZE = 10000
|
||||
|
||||
blob_subspace = fdb.Subspace(('myblob',))
|
||||
|
||||
@fdb.transactional
|
||||
def write_blob(tr, blob_data):
|
||||
length = len(blob_data)
|
||||
if not length: return
|
||||
chunks = [(n, n+CHUNK_SIZE) for n in range(0, length, CHUNK_SIZE)]
|
||||
for start, end in chunks:
|
||||
tr[blob_subspace[start]] = blob_data[start:end]
|
||||
|
||||
The blob can then be efficiently read with a single range read::
|
||||
|
||||
@fdb.transactional
|
||||
def read_blob(tr):
|
||||
blob_data = ''
|
||||
for k, v in tr[blob_subspace.range()]:
|
||||
blob_data += v
|
||||
return blob_data
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*A sparse representation for random reads and writes*
|
||||
|
||||
The simple model above lets us read and write a blob as a whole, but it can be extended to allow random reads and writes, allowing the blob to be partially accessed or streamed. At the same time, the representation can remain sparse, only consuming space for the data that is actually written.
|
||||
|
||||
For efficiency, we sometimes join chunks after a write to avoid the fragmentation that might result from small writes. Joining chunks is controlled by a constant ``CHUNK_SMALL`` (usually around 200 bytes), giving a lower limit for the sum of the sizes of adjacent chunks.
|
|
@ -0,0 +1,149 @@
|
|||
.. include:: guide-common.rst.inc
|
||||
|
||||
.. _building-cluster:
|
||||
|
||||
##################
|
||||
Building a Cluster
|
||||
##################
|
||||
|
||||
This guide walks through the steps to build an externally accessible FoundationDB cluster of one or more machines. Before setting up a cluster for performance testing or production use, you should also read the reference material in :doc:`configuration` and :doc:`administration`.
|
||||
|
||||
.. warning:: |development-use-only-warning|
|
||||
|
||||
To build an externally accessible FoundationDB cluster, perform the following steps:
|
||||
|
||||
.. contents::
|
||||
:depth: 1
|
||||
:local:
|
||||
:backlinks: none
|
||||
|
||||
Install FoundationDB
|
||||
====================
|
||||
|
||||
Follow the steps in :doc:`getting-started-linux` to install FoundationDB locally on each of the Linux machines that you wish to use in your cluster.
|
||||
|
||||
.. warning:: When building a cluster, do **not** simply copy the FoundationDB installation, and in particular the **data** files, from one machine to another, whether by direct copying or by cloning a VM.
|
||||
|
||||
Optimize for your hardware
|
||||
==========================
|
||||
|
||||
|optimize-configuration|
|
||||
|
||||
We recommend changing the configuration file once and copying to other machines in the cluster.
|
||||
|
||||
Make FoundationDB externally accessible
|
||||
=======================================
|
||||
|
||||
By default, FoundationDB installs on a single server in a locally accessible mode suitable for development --- only clients on the same machine will be able to access the database. To allow external access, you will have to make your :ref:`cluster file <foundationdb-cluster-file>` public.
|
||||
|
||||
Choose a machine to be the starting machine for your cluster. The database on this machine will be the one that we grow to span the cluster. Use the ``/usr/lib/foundationdb/make_public.py`` script on that server to update your cluster file to use a public interface. For example::
|
||||
|
||||
user@host1$ sudo /usr/lib/foundationdb/make_public.py
|
||||
/etc/foundationdb/fdb.cluster is now using address 10.0.1.1
|
||||
|
||||
By default, the script will pick a local network interface that can access the internet. To specify the address manually, use the ``-a`` flag and choose an address that is accessible by all machines in the cluster as well as by all intended clients. ::
|
||||
|
||||
user@host1$ sudo /usr/lib/foundationdb/make_public.py -a 10.0.1.1
|
||||
/etc/foundationdb/fdb.cluster is now using address 10.0.1.1
|
||||
|
||||
.. _test-the-database:
|
||||
|
||||
Test the database
|
||||
=================
|
||||
|
||||
At this point and after each subsequent step, it is a good idea to test the database to make sure it is operating normally. Run ``fdbcli`` on the starting machine::
|
||||
|
||||
user@host1$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> status
|
||||
|
||||
Configuration:
|
||||
Redundancy mode - single
|
||||
Storage engine - ssd
|
||||
Coordinators - 1
|
||||
|
||||
Cluster:
|
||||
FoundationDB processes - 1
|
||||
Machines - 1
|
||||
Memory availability - 4.1 GB per process on machine with least available
|
||||
Fault Tolerance - 0 machines
|
||||
Server time - Wed Oct 8 14:41:34 2014
|
||||
|
||||
Data:
|
||||
Replication health - Healthy
|
||||
Moving data - 0.000 GB
|
||||
Sum of key-value sizes - 0 MB
|
||||
|
||||
Operating space:
|
||||
Storage server - 1.0 GB free on most full server
|
||||
Transaction log - 1.0 GB free on most full server
|
||||
|
||||
Workload:
|
||||
Read rate - 2 Hz
|
||||
Write rate - 0 Hz
|
||||
Transactions started - 2 Hz
|
||||
Transactions committed - 0 Hz
|
||||
Conflict rate - 0 Hz
|
||||
|
||||
Client time: Thu Nov 20 09:50:45 2014
|
||||
|
||||
.. note:: If the database is not operational the status command will provide diagnostic information to help you resolve the issue. For more help, please post a question (and the results of the status command) on the `community forums <https://forums.foundationdb.org>`_.
|
||||
|
||||
Add machines to the cluster
|
||||
===========================
|
||||
|
||||
To add the rest of your machines to the cluster, perform the following steps on each one:
|
||||
|
||||
* Copy the cluster file from a server already in the cluster (located at ``/etc/foundationdb/fdb.cluster``) to the new machine, overwriting the existing ``fdb.cluster`` file.
|
||||
* Restart FoundationDB on the new machine so that it uses the new cluster file::
|
||||
|
||||
user@host2$ sudo service foundationdb restart
|
||||
|
||||
.. _change-redundancy-mode-and-storage-engine:
|
||||
|
||||
Change redundancy mode and storage engine
|
||||
=========================================
|
||||
|
||||
By default, the database will be in ``single`` redundancy mode and use the ``memory`` storage engine. You should change the redundancy mode (see :ref:`configuration-choosing-redundancy-mode`) and storage engine (see :ref:`configuration-storage-engine`) to appropriate values for your cluster.
|
||||
|
||||
For example, to use a triple-replicated database with the SSD storage engine, use the ``configure`` command in the ``fdbcli``::
|
||||
|
||||
user@host1$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> configure triple ssd
|
||||
Configuration changed
|
||||
|
||||
If the configure command hangs or returns an error message, see :ref:`test-the-database`.
|
||||
|
||||
Change coordination servers
|
||||
===========================
|
||||
|
||||
At this point, your cluster will be using the starting machine as the only coordination server, leaving that as a single point of failure. You should therefore select a fault-tolerant set of coordinators according to the criteria in :ref:`configuration-choosing-coordination-servers`. To switch the cluster to your chosen coordinators, run the ``fdbcli`` command on one of the servers and use the ``coordinators`` command to :ref:`set the coordinators <configuration-changing-coordination-servers>`. ::
|
||||
|
||||
user@host$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> coordinators 10.0.4.1:4500 10.0.4.2:4500 10.0.4.3:4500
|
||||
Coordinators changed
|
||||
|
||||
There is also a convenience option, ``coordinators auto``, that will automatically select a set of coordinators based on your redundancy mode.
|
||||
|
||||
.. note:: |coordinators-auto|
|
||||
|
||||
You can also change the cluster ``description``, as described in :ref:`configuration-setting-cluster-description`.
|
||||
|
||||
Next steps
|
||||
==========
|
||||
|
||||
To add or remove machines from the cluster or perform other administrative tasks, see :doc:`administration`.
|
|
@ -0,0 +1,71 @@
|
|||
###########
|
||||
CAP Theorem
|
||||
###########
|
||||
|
||||
A database *can* provide strong consistency *and* system availability during network partitions. The common belief that this combination is impossible is based on a misunderstanding of the CAP theorem.
|
||||
|
||||
What is the CAP Theorem?
|
||||
========================
|
||||
|
||||
In 2000, Eric Brewer conjectured that a distributed system cannot simultaneously provide all three of the following desirable properties:
|
||||
|
||||
* Consistency: A read sees all previously completed writes.
|
||||
* Availability: Reads and writes always succeed.
|
||||
* Partition tolerance: Guaranteed properties are maintained even when network failures prevent some machines from communicating with others.
|
||||
|
||||
In 2002, Gilbert and Lynch proved this in the asynchronous and partially synchronous network models, so it is now commonly called the `CAP Theorem <http://en.wikipedia.org/wiki/CAP_theorem>`_.
|
||||
|
||||
Brewer originally described this impossibility result as forcing a choice of "two out of the three" **CAP** properties, leaving three viable design options: **CP**, **AP**, and **CA**. However, further consideration shows that **CA** is not really a coherent option because a system that is not Partition-tolerant will, by definition, be forced to give up Consistency or Availability during a partition. Therefore, a more `modern interpretation <http://dbmsmusings.blogspot.com/2010/04/problems-with-cap-and-yahoos-little.html>`_ of the theorem is: *during a network partition, a distributed system must choose either Consistency or Availability.*
|
||||
|
||||
What does choosing Availability mean?
|
||||
=====================================
|
||||
|
||||
Let's consider an **AP** database. In such a database, reads and writes would always succeed, even when network connectivity is unavailable between nodes. If possible, these would certainly seem like desirable properties!
|
||||
|
||||
However, the downside is stark. Imagine a simple distributed database consisting of two nodes and a network partition making them unable to communicate. To be Available, each of the two nodes must continue to accept writes from clients.
|
||||
|
||||
.. figure:: /images/AP_Partition.png
|
||||
|
||||
Data divergence in an AP system during partition
|
||||
|
||||
Of course, because the partition makes communication impossible, a write on one node cannot be seen by the other. Such a system is now "a database" in name only. As long as the partition lasts, the system is fully equivalent to two independent databases whose contents need not even be related, much less consistent.
|
||||
|
||||
Where's the confusion?
|
||||
======================
|
||||
|
||||
Confusion about the CAP theorem usually involves the interpretation of the Availability property. Availability in the **CAP** sense means that *all* nodes remain able to read and write even when partitioned. A system that keeps some, but not all, of its nodes able to read and write is not Available in the **CAP** sense, *even if it remains available to clients* and satisfies its SLAs for `high availability <http://en.wikipedia.org/wiki/High_availability>`_.
|
||||
|
||||
What does FoundationDB choose?
|
||||
==============================
|
||||
|
||||
As any ACID database must, during a network partition FoundationDB chooses Consistency over Availability. This does *not* mean that the database becomes unavailable for clients. When multiple machines or datacenters hosting a FoundationDB database are unable to communicate, *some* of them will be unable to execute writes. In a wide variety of real-world cases, the database and the application using it will remain up. A network partition affecting some machines is no worse than a failure of those same machines, which FoundationDB handles gracefully due to its fault tolerant design.
|
||||
|
||||
FoundationDB fault tolerance
|
||||
============================
|
||||
|
||||
FoundationDB's design goal is to make sure that, even if some machines are down or unable to communicate reliably with the network, the database and the application connected to it remain up. This is high availability as usually understood, but it is *not* Availability in the **CAP** sense because the database will be unavailable *on the affected machines*.
|
||||
|
||||
FoundationDB seeks to present user applications with a single (logical) database. The challenge of handling a partition is to determine which machines should continue to accept reads and writes. To make this determination, FoundationDB is configured with a set of *coordination servers*. FoundationDB selects the partition in which a majority of these servers are available as the one that will remain responsive. If failures are so pervasive that there is *no* such partition, then the database really will be unavailable.
|
||||
|
||||
The coordination servers use the `Paxos <http://en.wikipedia.org/wiki/Paxos_(computer_science)>`_ algorithm to maintain a small amount of shared state that itself is Consistent and Partition-tolerant. Like the database as a whole, the shared state is not Available but *is* available for reads and writes in the partition containing a majority of the coordination servers.
|
||||
|
||||
FoundationDB uses this shared state to maintain and update a replication topology. When a failure occurs, the coordination servers are used to change the replication topology. It's worth noting that the coordination servers aren't involved at all in committing transactions.
|
||||
|
||||
Example: a minimal configuration
|
||||
================================
|
||||
|
||||
To illustrate how the coordination servers support fault tolerance, let's look at a FoundationDB cluster of the minimal size that allows for data replication. Of course, the fault tolerance and availability provided by coordination are higher when the cluster is larger.
|
||||
|
||||
Imagine a small web startup that wants its application, served by FoundationDB within a datacenter, to stay available even if a machine fails. It sets up a cluster of three machines - A, B, and C - each running a database server and a coordination server. Applying the majority rule to this cluster, any pair of machines that can communicate will remain available. The startup configures FoundationDB in its ``double`` redundancy mode, in which the system will make two copies of each piece of data, each on a different machine.
|
||||
|
||||
Imagine that a rack-top switch fails, and A is partitioned from the network. A will be unable to commit new transactions because FoundationDB requires an acknowledgment from B or C. The database server on A can only communicate with the coordination server on A, so it will not be able to achieve a majority to set up a new replication topology. For any client communicating only with A, the database is down.
|
||||
|
||||
However, for all other clients, the database servers can reach a majority of coordination servers, B and C. The replication configuration has ensured there is a full copy of the data available even without A. For these clients, the database will remain available for reads and writes and the web servers will continue to serve traffic.
|
||||
|
||||
.. figure:: /images/FDB_Partition.png
|
||||
|
||||
Maintenance of availability during partition
|
||||
|
||||
When the partition ends, A will again be able to communicate with the majority of coordination servers and will rejoin the database. Depending on how long the communications failure lasted, A will rejoin by either receiving transactions that occurred in its absence or, in the worst case, transferring the contents of the database. After A has rejoined the database, all machines will again be able to handle transactions in a fault tolerant manner.
|
||||
|
||||
In contrast to the minimal cluster above, an actual production system would typically be configured in ``triple`` redundancy mode on five or more machines, giving it correspondingly higher availability. For further details, read our discussion of :doc:`fault tolerance <fault-tolerance>`.
|
|
@ -0,0 +1,778 @@
|
|||
######################
|
||||
Class Scheduling in Go
|
||||
######################
|
||||
|
||||
This tutorial provides a walkthrough of designing and building a simple application in Go using FoundationDB. In this tutorial, we use a few simple data modeling techniques. For a more in-depth discussion of data modeling in FoundationDB, see :doc:`data-modeling`.
|
||||
|
||||
The concepts in this tutorial are applicable to all the :doc:`languages <api-reference>` supported by FoundationDB. If you prefer, you can see a version of this tutorial in :doc:`Python <class-scheduling>`, :doc:`Ruby <class-scheduling-ruby>`, or :doc:`Java <class-scheduling-java>`.
|
||||
|
||||
.. _class-sched-go-first-steps:
|
||||
|
||||
First steps
|
||||
===========
|
||||
|
||||
Let's begin with "Hello world."
|
||||
|
||||
If you have not yet installed FoundationDB, see :doc:`getting-started-mac` or :doc:`getting-started-linux`.
|
||||
|
||||
We'll start by importing the basic FoundationDB package, as well as the ``log`` and ``format`` packages:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
import (
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb"
|
||||
"log"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
Before using the API, we need to specify the API version. This allows programs to maintain compatibility even if the API is modified in future versions:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
fdb.MustAPIVersion(510)
|
||||
|
||||
Next, we open a FoundationDB database. The API will connect to the FoundationDB cluster indicated by the :ref:`default cluster file <default-cluster-file>`.
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
db := fdb.MustOpenDefault()
|
||||
|
||||
We're ready to use the database. First, let's write a key-value pair.
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
_, err := db.Transact(func (tr fdb.Transaction) (ret interface{}, e error) {
|
||||
tr.Set(fdb.Key("hello"), []byte("world"))
|
||||
return
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("Unable to set FDB database value (%v)", err)
|
||||
}
|
||||
|
||||
When this function returns without error, the modification is durably stored in FoundationDB! Under the covers, this function creates a transaction with a single modification. We’ll see later how to do multiple operations in a single transaction. For now, let’s read back the data:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
ret, err := db.Transact(func (tr fdb.Transaction) (ret interface{}, e error) {
|
||||
ret = tr.Get(fdb.Key("hello")).MustGet()
|
||||
return
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("Unable to read FDB database value (%v)", err)
|
||||
}
|
||||
|
||||
v := ret.([]byte)
|
||||
fmt.Printf("hello, %s\n", string(v))
|
||||
|
||||
If this is all working, it looks like we are ready to start building a real application. For reference, here's the full code for "hello world":
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb"
|
||||
"log"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Different API versions may expose different runtime behaviors.
|
||||
fdb.MustAPIVersion(510)
|
||||
|
||||
// Open the default database from the system cluster
|
||||
db := fdb.MustOpenDefault()
|
||||
|
||||
_, err := db.Transact(func (tr fdb.Transaction) (ret interface{}, e error) {
|
||||
tr.Set(fdb.Key("hello"), []byte("world"))
|
||||
return
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("Unable to set FDB database value (%v)", err)
|
||||
}
|
||||
|
||||
ret, err := db.Transact(func (tr fdb.Transaction) (ret interface{}, e error) {
|
||||
ret = tr.Get(fdb.Key("hello")).MustGet()
|
||||
return
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("Unable to read FDB database value (%v)", err)
|
||||
}
|
||||
|
||||
v := ret.([]byte)
|
||||
fmt.Printf("hello, %s\n", string(v))
|
||||
}
|
||||
|
||||
Class scheduling application
|
||||
============================
|
||||
|
||||
Let's say we've been asked to build a class scheduling system for students and administrators. We'll walk through the design and implementation of this application. Instead of typing everything in as you follow along, look at the :ref:`class-sched-go-appendix` for a finished version of the program. You may want to refer to this code as we walk through the tutorial.
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
We'll need to let users list available classes and track which students have signed up for which classes. Here's a first cut at the functions we'll need to implement::
|
||||
|
||||
availableClasses() // returns list of classes
|
||||
signup(studentID, class) // signs up a student for a class
|
||||
drop(studentID, class) // drops a student from a class
|
||||
|
||||
.. _class-sched-go-data-model:
|
||||
|
||||
Data model
|
||||
----------
|
||||
|
||||
First, we need to design a :doc:`data model <data-modeling>`. A data model is just a method for storing our application data using keys and values in FoundationDB. We seem to have two main types of data: (1) a list of classes and (2) a record of which students will attend which classes. Let's keep attending data like this::
|
||||
|
||||
// ("attends", student, class) = ""
|
||||
|
||||
We'll just store the key with a blank value to indicate that a student is signed up for a particular class. For this application, we're going to think about a key-value pair's key as a :ref:`tuple <data-modeling-tuples>`. Encoding a tuple of data elements into a key is a very common pattern for an ordered key-value store.
|
||||
|
||||
We'll keep data about classes like this::
|
||||
|
||||
// ("class", class_name) = seatsAvailable
|
||||
|
||||
Similarly, each such key will represent an available class. We'll use ``seatsAvailable`` to record the number of seats available.
|
||||
|
||||
Directories and Subspaces
|
||||
-------------------------
|
||||
|
||||
FoundationDB includes a few modules that make it easy to model data using this approach::
|
||||
|
||||
import (
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb"
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb/directory"
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb/subspace"
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb/tuple"
|
||||
)
|
||||
|
||||
The :mod:`directory` module lets us open a :ref:`directory <developer-guide-directories>` in the database::
|
||||
|
||||
schedulingDir, err := directory.CreateOrOpen(db, []string{"scheduling"}, nil)
|
||||
    if err != nil {
|
||||
        log.Fatal(err)
|
||||
}
|
||||
|
||||
The :func:`CreateOrOpen` function returns a :ref:`subspace <developer-guide-sub-keyspaces>` where we'll store our application data. Each subspace has a fixed prefix it uses when defining keys. The prefix corresponds to the first element of a tuple. We decided that we wanted ``"attends"`` and ``"class"`` as our prefixes, so we'll create new subspaces for them within the ``scheduling`` subspace.::
|
||||
|
||||
courseSS = schedulingDir.Sub("class")
|
||||
attendSS = schedulingDir.Sub("attends")
|
||||
|
||||
Subspaces have a :func:`Pack` function for defining keys. To store the records for our data model, we can use ``attendSS.Pack(tuple.Tuple{studentID, class})`` and ``courseSS.Pack(tuple.Tuple{class})``.
|
||||
|
||||
Transactions
|
||||
------------
|
||||
|
||||
We're going to rely on the powerful guarantees of transactions to help keep all of our modifications straight, so let's look at how the FoundationDB Go API lets you write a transactional function. We use :func:`Transact` to execute a code block transactionally. For example, to ``signup`` a ``studentID`` for a ``class``, we might use:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
func signup(t fdb.Transactor, studentID, class string) (err error) {
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
tr.Set(attendSS.Pack(tuple.Tuple{studentID, class}), []byte{})
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
A function using this approach takes a parameter of type ``Transactor``. When *calling* such a function, you can pass an argument of type ``Database`` or ``Transaction``. The function to be executed transactionally is parameterized by the ``Transaction`` it will use to do reads and writes.
|
||||
|
||||
When using a ``Database``, :func:`Transact` *automatically creates a transaction and implements a retry loop* to ensure that the transaction eventually commits. If you instead pass a ``Transaction``, that transaction will be used directly, and it is assumed that the caller implements appropriate retry logic for errors. This permits functions using this pattern to be composed into larger transactions.
|
||||
|
||||
Without the :func:`Transact` method, signup would look something like:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
func signup(db fdb.Database, studentID, class string) (err error) {
|
||||
        tr, err := db.CreateTransaction()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
wrapped := func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
                    e, ok := r.(fdb.Error)
|
||||
if ok {
|
||||
err = e
|
||||
} else {
|
||||
panic(r)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
tr.Set(attendSS.Pack(tuple.Tuple{studentID, class}), []byte{})
|
||||
|
||||
err = tr.Commit().Get()
|
||||
}
|
||||
|
||||
for {
|
||||
wrapped()
|
||||
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
|
||||
            fe, ok := err.(fdb.Error)
|
||||
if ok {
|
||||
err = tr.OnError(fe).Get()
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Furthermore, this version can only be called with a ``Database``, making it impossible to compose larger transactional functions by calling one from another.
|
||||
|
||||
Making some sample classes
|
||||
--------------------------
|
||||
|
||||
Let's make some sample classes and put them in the ``classNames`` variable. We'll make individual classes from combinations of class types, levels, and times:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
var levels = []string{"intro", "for dummies", "remedial", "101", "201", "301", "mastery", "lab", "seminar"}
|
||||
var types = []string{"chem", "bio", "cs", "geometry", "calc", "alg", "film", "music", "art", "dance"}
|
||||
var times = []string{"2:00", "3:00", "4:00", "5:00", "6:00", "7:00", "8:00", "9:00", "10:00", "11:00",
|
||||
"12:00", "13:00", "14:00", "15:00", "16:00", "17:00", "18:00", "19:00"}
|
||||
|
||||
classes := make([]string, len(levels) * len(types) * len(times))
|
||||
|
||||
for i := range levels {
|
||||
for j := range types {
|
||||
for k := range times {
|
||||
classes[i*len(types)*len(times)+j*len(times)+k] = fmt.Sprintf("%s %s %s", levels[i], types[j], times[k])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Initializing the database
|
||||
-------------------------
|
||||
Next, we initialize the database with our class list:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
_, err = db.Transact(func (tr fdb.Transaction) (interface{}, error) {
|
||||
tr.ClearRange(schedulingDir)
|
||||
|
||||
for i := range classes {
|
||||
tr.Set(courseSS.Pack(tuple.Tuple{classes[i]}), []byte(strconv.FormatInt(100, 10)))
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
})
|
||||
|
||||
After this code is run, the database will contain all of the sample classes we created above.
|
||||
|
||||
Listing available classes
|
||||
-------------------------
|
||||
|
||||
Before students can do anything else, they need to be able to retrieve a list of available classes from the database. Because FoundationDB sorts its data by key and therefore has efficient range-read capability, we can retrieve all of the classes in a single database call. We find this range of keys with :func:`GetRange`:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
func availableClasses(t fdb.Transactor) (ac []string, err error) {
|
||||
r, err := t.ReadTransact(func (rtr fdb.ReadTransaction) (interface{}, error) {
|
||||
var classes []string
|
||||
ri := rtr.GetRange(courseSS, fdb.RangeOptions{}).Iterator()
|
||||
for ri.Advance() {
|
||||
kv := ri.MustGet()
|
||||
t, err := courseSS.Unpack(kv.Key)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
classes = append(classes, t[0].(string))
|
||||
}
|
||||
return classes, nil
|
||||
})
|
||||
if err == nil {
|
||||
ac = r.([]string)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
The :func:`GetRange` function returns the key-values specified by its range. In this case, we use the subspace ``courseSS`` to get all the classes.
|
||||
|
||||
Signing up for a class
|
||||
----------------------
|
||||
|
||||
We finally get to the crucial function (which we saw before when looking at :func:`Transact`). A student has decided on a class (by name) and wants to sign up. The ``signup`` function will take a ``studentID`` and a ``class``:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
func signup(t fdb.Transactor, studentID, class string) (err error) {
|
||||
SCKey := attendSS.Pack(tuple.Tuple{studentID, class})
|
||||
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
tr.Set(SCKey, []byte{})
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
We simply insert the appropriate record (with a blank value).
|
||||
|
||||
Dropping a class
|
||||
----------------
|
||||
|
||||
Dropping a class is similar to signing up:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
func drop(t fdb.Transactor, studentID, class string) (err error) {
|
||||
SCKey := attendSS.Pack(tuple.Tuple{studentID, class})
|
||||
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
tr.Clear(SCKey)
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
Of course, to actually drop the student from the class, we need to be able to delete a record from the database. We do this with the :func:`Clear` function.
|
||||
|
||||
Done?
|
||||
-----
|
||||
|
||||
We report back to the project leader that our application is done---students can sign up for, drop, and list classes. Unfortunately, we learn that there has been a bit of scope creep in the mean time. Popular classes are getting over-subscribed, and our application is going to need to enforce the class size constraint as students add and drop classes.
|
||||
|
||||
Seats are limited!
|
||||
------------------
|
||||
|
||||
Let's go back to the data model. Remember that we stored the number of seats in the class in the value of the key-value entry in the class list. Let's refine that a bit to track the *remaining* number of seats in the class. The initialization can work the same way (in our example, all classes initially have 100 seats), but the ``availableClasses``, ``signup``, and ``drop`` functions are going to have to change. Let's start with ``availableClasses``:
|
||||
|
||||
.. code-block:: go
|
||||
:emphasize-lines: 7-11
|
||||
|
||||
func availableClasses(t fdb.Transactor) (ac []string, err error) {
|
||||
r, err := t.ReadTransact(func (rtr fdb.ReadTransaction) (interface{}, error) {
|
||||
var classes []string
|
||||
ri := rtr.GetRange(courseSS, fdb.RangeOptions{}).Iterator()
|
||||
for ri.Advance() {
|
||||
kv := ri.MustGet()
|
||||
v, err := strconv.ParseInt(string(kv.Value), 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if v > 0 {
|
||||
t, err := courseSS.Unpack(kv.Key)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
classes = append(classes, t[0].(string))
|
||||
}
|
||||
}
|
||||
return classes, nil
|
||||
})
|
||||
if err == nil {
|
||||
ac = r.([]string)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
This is easy -- we simply add a condition to check that the value is non-zero. Let's check out ``signup`` next:
|
||||
|
||||
.. code-block:: go
|
||||
:emphasize-lines: 6-19
|
||||
|
||||
func signup(t fdb.Transactor, studentID, class string) (err error) {
|
||||
SCKey := attendSS.Pack(tuple.Tuple{studentID, class})
|
||||
classKey := courseSS.Pack(tuple.Tuple{class})
|
||||
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
if tr.Get(SCKey).MustGet() != nil {
|
||||
return // already signed up
|
||||
}
|
||||
|
||||
seats, err := strconv.ParseInt(string(tr.Get(classKey).MustGet()), 10, 64)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if seats == 0 {
|
||||
err = errors.New("no remaining seats")
|
||||
return
|
||||
}
|
||||
|
||||
tr.Set(classKey, []byte(strconv.FormatInt(seats - 1, 10)))
|
||||
tr.Set(SCKey, []byte{})
|
||||
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
We now have to check that we aren't already signed up, since we don't want a double sign up to decrease the number of seats twice. Then we look up how many seats are left to make sure there is a seat remaining so we don't push the counter into the negative. If there is a seat remaining, we decrement the counter.
|
||||
|
||||
|
||||
Concurrency and consistency
|
||||
---------------------------
|
||||
|
||||
The ``signup`` function is starting to get a bit complex; it now reads and writes a few different key-value pairs in the database. One of the tricky issues in this situation is what happens as multiple clients/students read and modify the database at the same time. Couldn't two students both see one remaining seat and sign up at the same time?
|
||||
|
||||
These are tricky issues without simple answers---unless you have transactions! Because these functions are defined as FoundationDB transactions, we can have a simple answer: Each transactional function behaves as if it is the only one modifying the database. There is no way for a transaction to 'see' another transaction change the database, and each transaction ensures that either all of its modifications occur or none of them do.
|
||||
|
||||
Looking deeper, it is, of course, possible for two transactions to conflict. For example, if two people both see a class with one seat and sign up at the same time, FoundationDB must allow only one to succeed. This causes one of the transactions to fail to commit (which can also be caused by network outages, crashes, etc.). To ensure correct operation, applications need to handle this situation, usually via retrying the transaction. In this case, the conflicting transaction will be retried automatically by the :func:`Transact` function and will eventually lead to the correct result, a 'No remaining seats' error.
|
||||
|
||||
Idempotence
|
||||
-----------
|
||||
|
||||
Occasionally, a transaction might be retried even after it succeeds (for example, if the client loses contact with the cluster at just the wrong moment). This can cause problems if transactions are not written to be idempotent, i.e. to have the same effect if committed twice as if committed once. There are generic design patterns for :ref:`making any transaction idempotent <developer-guide-unknown-results>`, but many transactions are naturally idempotent. For example, all of the transactions in this tutorial are idempotent.
|
||||
|
||||
Dropping with limited seats
|
||||
---------------------------
|
||||
|
||||
Let's finish up the limited seats feature by modifying the drop function:
|
||||
|
||||
.. code-block:: go
|
||||
:emphasize-lines: 6-15
|
||||
|
||||
func drop(t fdb.Transactor, studentID, class string) (err error) {
|
||||
SCKey := attendSS.Pack(tuple.Tuple{studentID, class})
|
||||
classKey := courseSS.Pack(tuple.Tuple{class})
|
||||
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
if tr.Get(SCKey).MustGet() == nil {
|
||||
return // not taking this class
|
||||
}
|
||||
|
||||
seats, err := strconv.ParseInt(string(tr.Get(classKey).MustGet()), 10, 64)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
tr.Set(classKey, []byte(strconv.FormatInt(seats + 1, 10)))
|
||||
tr.Clear(SCKey)
|
||||
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
This case is easier than signup because there are no constraints we can hit. We just need to make sure the student is in the class and to "give back" one seat when the student drops.
|
||||
|
||||
More features?!
|
||||
---------------
|
||||
|
||||
Of course, as soon as our new version of the system goes live, we hear of a trick that certain students are using. They are signing up for all classes immediately, and only later dropping those that they don't want to take. This has led to an unusable system, and we have been asked to fix it. We decide to limit students to five classes:
|
||||
|
||||
.. code-block:: go
|
||||
:emphasize-lines: 19-23
|
||||
|
||||
func signup(t fdb.Transactor, studentID, class string) (err error) {
|
||||
SCKey := attendSS.Pack(tuple.Tuple{studentID, class})
|
||||
classKey := courseSS.Pack(tuple.Tuple{class})
|
||||
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
if tr.Get(SCKey).MustGet() != nil {
|
||||
return // already signed up
|
||||
}
|
||||
|
||||
seats, err := strconv.ParseInt(string(tr.Get(classKey).MustGet()), 10, 64)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if seats == 0 {
|
||||
err = errors.New("no remaining seats")
|
||||
return
|
||||
}
|
||||
|
||||
classes := tr.GetRange(attendSS.Sub(studentID), fdb.RangeOptions{Mode: fdb.StreamingModeWantAll}).GetSliceOrPanic()
|
||||
if len(classes) == 5 {
|
||||
err = errors.New("too many classes")
|
||||
return
|
||||
}
|
||||
|
||||
tr.Set(classKey, []byte(strconv.FormatInt(seats - 1, 10)))
|
||||
tr.Set(SCKey, []byte{})
|
||||
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
Fortunately, we decided on a data model that keeps all of the attending records for a single student together. With this approach, we can use a single range read to retrieve all the classes that a student attends. We simply return an error if the number of classes has reached the limit of five.
|
||||
|
||||
Composing transactions
|
||||
----------------------
|
||||
|
||||
Oh, just one last feature, we're told. We have students that are trying to switch from one popular class to another. By the time they drop one class to free up a slot for themselves, the open slot in the other class is gone. By the time they see this and try to re-add their old class, that slot is gone too! So, can we make it so that a student can switch from one class to another without this worry?
|
||||
|
||||
Fortunately, we have FoundationDB, and this sounds an awful lot like the transactional property of atomicity---the all-or-nothing behavior that we already rely on. All we need to do is to *compose* the ``drop`` and ``signup`` functions into a new ``swap`` function. This makes the ``swap`` function exceptionally easy:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
func swap(t fdb.Transactor, studentID, oldClass, newClass string) (err error) {
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
err = drop(tr, studentID, oldClass)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
err = signup(tr, studentID, newClass)
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
The simplicity of this implementation belies the sophistication of what FoundationDB is taking care of for us.
|
||||
|
||||
By dropping the old class and signing up for the new one inside a single transaction, we ensure that either both steps happen, or that neither happens. The first notable thing about the ``swap`` function is that it is transactional, but it also calls the transactional functions ``signup`` and ``drop``. Because these transactional functions can accept either a ``Database`` or an existing ``Transaction`` as their ``t`` parameter, the ``swap`` function can be called with a database by a simple client, and a new transaction will be automatically created. However, once this transaction is created and passed in as ``tr``, the calls to ``drop`` and ``signup`` both share the same ``tr``. This ensures that they see each other's modifications to the database, and all of the changes that both of them make in sequence are made transactionally when the ``swap`` function returns. This compositional capability is very powerful.
|
||||
|
||||
Also note that, if an exception is raised, for example, in ``signup``, the exception is not caught by ``swap`` and so will be thrown to the calling function. In this case, the transaction object (owned by the :func:`Transact` function) is destroyed, automatically rolling back all database modifications, leaving the database completely unchanged by the half-executed function.
|
||||
|
||||
Are we done?
|
||||
------------
|
||||
|
||||
Yep, we're done. Fortunately, our UI team built an awesome UI while we were working on our back end, and we are ready to deploy. If you want to see this entire application in one place plus some concurrent testing code, look at the :ref:`class-sched-go-appendix`, below.
|
||||
|
||||
Deploying and scaling
|
||||
---------------------
|
||||
|
||||
Since we store all state for this application in FoundationDB, deploying and scaling this solution up is impressively painless. Just run a web server, the UI, this back end, and point the whole thing at FoundationDB. We can run as many computers with this setup as we want, and they can all hit the database at the same time because of the transactional integrity of FoundationDB. Also, since all of the state in the system is stored in the database, any of these computers can fail without any lasting consequences.
|
||||
|
||||
Next steps
|
||||
==========
|
||||
|
||||
* See :doc:`data-modeling` for guidance on using tuple and subspaces to enable effective storage and retrieval of data.
|
||||
* See :doc:`developer-guide` for general guidance on development using FoundationDB.
|
||||
* See the :doc:`API References <api-reference>` for detailed API documentation.
|
||||
|
||||
.. _class-sched-go-appendix:
|
||||
|
||||
Appendix: classScheduling.go
|
||||
============================
|
||||
|
||||
Here's the code for the scheduling tutorial:
|
||||
|
||||
.. code-block:: go
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb"
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb/directory"
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb/subspace"
|
||||
"github.com/apple/foundationdb/bindings/go/src/fdb/tuple"
|
||||
|
||||
"fmt"
|
||||
"log"
|
||||
"strconv"
|
||||
"errors"
|
||||
"sync"
|
||||
"math/rand"
|
||||
)
|
||||
|
||||
var courseSS subspace.Subspace
|
||||
var attendSS subspace.Subspace
|
||||
|
||||
var classes []string
|
||||
|
||||
func availableClasses(t fdb.Transactor) (ac []string, err error) {
|
||||
r, err := t.ReadTransact(func (rtr fdb.ReadTransaction) (interface{}, error) {
|
||||
var classes []string
|
||||
ri := rtr.GetRange(courseSS, fdb.RangeOptions{}).Iterator()
|
||||
for ri.Advance() {
|
||||
kv := ri.MustGet()
|
||||
v, err := strconv.ParseInt(string(kv.Value), 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if v > 0 {
|
||||
t, err := courseSS.Unpack(kv.Key)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
classes = append(classes, t[0].(string))
|
||||
}
|
||||
}
|
||||
return classes, nil
|
||||
})
|
||||
if err == nil {
|
||||
ac = r.([]string)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func signup(t fdb.Transactor, studentID, class string) (err error) {
|
||||
SCKey := attendSS.Pack(tuple.Tuple{studentID, class})
|
||||
classKey := courseSS.Pack(tuple.Tuple{class})
|
||||
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
if tr.Get(SCKey).MustGet() != nil {
|
||||
return // already signed up
|
||||
}
|
||||
|
||||
seats, err := strconv.ParseInt(string(tr.Get(classKey).MustGet()), 10, 64)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if seats == 0 {
|
||||
err = errors.New("no remaining seats")
|
||||
return
|
||||
}
|
||||
|
||||
classes := tr.GetRange(attendSS.Sub(studentID), fdb.RangeOptions{Mode: fdb.StreamingModeWantAll}).GetSliceOrPanic()
|
||||
if len(classes) == 5 {
|
||||
err = errors.New("too many classes")
|
||||
return
|
||||
}
|
||||
|
||||
tr.Set(classKey, []byte(strconv.FormatInt(seats - 1, 10)))
|
||||
tr.Set(SCKey, []byte{})
|
||||
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
func drop(t fdb.Transactor, studentID, class string) (err error) {
|
||||
SCKey := attendSS.Pack(tuple.Tuple{studentID, class})
|
||||
classKey := courseSS.Pack(tuple.Tuple{class})
|
||||
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
if tr.Get(SCKey).MustGet() == nil {
|
||||
return // not taking this class
|
||||
}
|
||||
|
||||
seats, err := strconv.ParseInt(string(tr.Get(classKey).MustGet()), 10, 64)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
tr.Set(classKey, []byte(strconv.FormatInt(seats + 1, 10)))
|
||||
tr.Clear(SCKey)
|
||||
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
func swap(t fdb.Transactor, studentID, oldClass, newClass string) (err error) {
|
||||
_, err = t.Transact(func (tr fdb.Transaction) (ret interface{}, err error) {
|
||||
err = drop(tr, studentID, oldClass)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
err = signup(tr, studentID, newClass)
|
||||
return
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
func main() {
|
||||
fdb.MustAPIVersion(510)
|
||||
db := fdb.MustOpenDefault()
|
||||
|
||||
schedulingDir, err := directory.CreateOrOpen(db, []string{"scheduling"}, nil)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
courseSS = schedulingDir.Sub("class")
|
||||
attendSS = schedulingDir.Sub("attends")
|
||||
|
||||
var levels = []string{"intro", "for dummies", "remedial", "101", "201", "301", "mastery", "lab", "seminar"}
|
||||
var types = []string{"chem", "bio", "cs", "geometry", "calc", "alg", "film", "music", "art", "dance"}
|
||||
var times = []string{"2:00", "3:00", "4:00", "5:00", "6:00", "7:00", "8:00", "9:00", "10:00", "11:00",
|
||||
"12:00", "13:00", "14:00", "15:00", "16:00", "17:00", "18:00", "19:00"}
|
||||
|
||||
        classes = make([]string, len(levels) * len(types) * len(times))
|
||||
|
||||
for i := range levels {
|
||||
for j := range types {
|
||||
for k := range times {
|
||||
classes[i*len(types)*len(times)+j*len(times)+k] = fmt.Sprintf("%s %s %s", levels[i], types[j], times[k])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_, err = db.Transact(func (tr fdb.Transaction) (interface{}, error) {
|
||||
tr.ClearRange(schedulingDir)
|
||||
|
||||
for i := range classes {
|
||||
tr.Set(courseSS.Pack(tuple.Tuple{classes[i]}), []byte(strconv.FormatInt(100, 10)))
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
})
|
||||
|
||||
run(db, 10, 10)
|
||||
}
|
||||
|
||||
func indecisiveStudent(db fdb.Database, id, ops int, wg *sync.WaitGroup) {
|
||||
studentID := fmt.Sprintf("s%d", id)
|
||||
|
||||
allClasses := classes
|
||||
|
||||
var myClasses []string
|
||||
|
||||
for i := 0; i < ops; i++ {
|
||||
var moods []string
|
||||
if len(myClasses) > 0 {
|
||||
moods = append(moods, "drop", "switch")
|
||||
}
|
||||
if len(myClasses) < 5 {
|
||||
moods = append(moods, "add")
|
||||
}
|
||||
|
||||
func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
fmt.Println("Need to recheck classes:", r)
|
||||
allClasses = []string{}
|
||||
}
|
||||
}()
|
||||
|
||||
var err error
|
||||
|
||||
if len(allClasses) == 0 {
|
||||
allClasses, err = availableClasses(db)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
switch moods[rand.Intn(len(moods))] {
|
||||
case "add":
|
||||
class := allClasses[rand.Intn(len(allClasses))]
|
||||
err = signup(db, studentID, class)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
myClasses = append(myClasses, class)
|
||||
case "drop":
|
||||
classI := rand.Intn(len(myClasses))
|
||||
err = drop(db, studentID, myClasses[classI])
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
myClasses[classI], myClasses = myClasses[len(myClasses)-1], myClasses[:len(myClasses)-1]
|
||||
case "switch":
|
||||
oldClassI := rand.Intn(len(myClasses))
|
||||
newClass := allClasses[rand.Intn(len(allClasses))]
|
||||
err = swap(db, studentID, myClasses[oldClassI], newClass)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
myClasses[oldClassI] = newClass
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Done()
|
||||
}
|
||||
|
||||
func run(db fdb.Database, students, opsPerStudent int) {
|
||||
var wg sync.WaitGroup
|
||||
|
||||
wg.Add(students)
|
||||
|
||||
for i := 0; i < students; i++ {
|
||||
go indecisiveStudent(db, i, opsPerStudent, &wg)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
fmt.Println("Ran", students * opsPerStudent, "transactions")
|
||||
}
|
|
@ -0,0 +1,617 @@
|
|||
########################
|
||||
Class Scheduling in Java
|
||||
########################
|
||||
|
||||
This tutorial provides a walkthrough of designing and building a simple application in Java using FoundationDB. In this tutorial, we use a few simple data modeling techniques. For a more in-depth discussion of data modeling in FoundationDB, see :doc:`data-modeling`.
|
||||
|
||||
The concepts in this tutorial are applicable to all the :doc:`languages <api-reference>` supported by FoundationDB. If you prefer, you can see a version of this tutorial in :doc:`Python <class-scheduling>`, :doc:`Ruby <class-scheduling-ruby>`, or :doc:`Go <class-scheduling-go>`.
|
||||
|
||||
.. _class-sched-java-first-steps:
|
||||
|
||||
First steps
|
||||
===========
|
||||
|
||||
Let's begin with "Hello world."
|
||||
|
||||
If you have not yet installed FoundationDB, see :doc:`getting-started-mac` or :doc:`getting-started-linux`.
|
||||
|
||||
We'll start by importing the basic FoundationDB package, as well as the :class:`Tuple` class:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
import com.apple.foundationdb.*;
|
||||
import com.apple.foundationdb.tuple.Tuple;
|
||||
|
||||
Before using the API, we need to specify the API version. This allows programs to maintain compatibility even if the API is modified in future versions. Next, we open a FoundationDB database. The API will connect to the FoundationDB cluster indicated by the :ref:`default cluster file <default-cluster-file>`.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
private static final FDB fdb;
|
||||
private static final Database db;
|
||||
|
||||
static {
|
||||
fdb = FDB.selectAPIVersion(510);
|
||||
db = fdb.open();
|
||||
}
|
||||
|
||||
We're ready to use the database. First, let's write a key-value pair. We do this by executing a transaction with the :meth:`run` method. We'll also use methods of the :class:`Tuple` class to :meth:`pack` data for storage in the database:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
db.run((Transaction tr) -> {
|
||||
tr.set(Tuple.from("hello").pack(), Tuple.from("world").pack());
|
||||
return null;
|
||||
});
|
||||
|
||||
When :meth:`run` returns without exception, the modification is durably stored in FoundationDB! This method creates a transaction with a single modification. We'll see later how to do multiple operations in a single transaction. For now, let's read back the data. We'll use :class:`Tuple` again to unpack the ``result`` as a string:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
String hello = db.run((Transaction tr) -> {
|
||||
byte[] result = tr.get(Tuple.from("hello").pack()).join();
|
||||
return Tuple.fromBytes(result).getString(0);
|
||||
});
|
||||
System.out.println("Hello " + hello);
|
||||
|
||||
If this is all working, it looks like we are ready to start building a real application. For reference, here's the full code for "hello world":
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
import com.apple.foundationdb.*;
|
||||
import com.apple.foundationdb.tuple.Tuple;
|
||||
|
||||
public class HelloWorld {
|
||||
|
||||
private static final FDB fdb;
|
||||
private static final Database db;
|
||||
|
||||
static {
|
||||
fdb = FDB.selectAPIVersion(510);
|
||||
db = fdb.open();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
// Run an operation on the database
|
||||
db.run((Transaction tr) -> {
|
||||
tr.set(Tuple.from("hello").pack(), Tuple.from("world").pack());
|
||||
return null;
|
||||
});
|
||||
// Get the value of 'hello' from the database
|
||||
String hello = db.run((Transaction tr) -> {
|
||||
byte[] result = tr.get(Tuple.from("hello").pack()).join();
|
||||
return Tuple.fromBytes(result).getString(0);
|
||||
});
|
||||
System.out.println("Hello " + hello);
|
||||
}
|
||||
}
|
||||
|
||||
Class scheduling application
|
||||
============================
|
||||
|
||||
Let's say we've been asked to build a class scheduling system for students and administrators. We'll walk through the design and implementation of this application. Instead of typing everything in as you follow along, look at the :ref:`class-sched-java-appendix` for a finished version of the program. You may want to refer to this code as we walk through the tutorial.
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
We'll need to let users list available classes and track which students have signed up for which classes. Here's a first cut at the functions we'll need to implement::
|
||||
|
||||
availableClasses() // returns list of classes
|
||||
signup(studentID, class) // signs up a student for a class
|
||||
drop(studentID, class) // drops a student from a class
|
||||
|
||||
.. _class-sched-java-data-model:
|
||||
|
||||
Data model
|
||||
----------
|
||||
|
||||
First, we need to design a :doc:`data model <data-modeling>`. A data model is just a method for storing our application data using keys and values in FoundationDB. We seem to have two main types of data: (1) a list of classes and (2) a record of which students will attend which classes. Let's keep attending data like this::
|
||||
|
||||
// ("attends", student, class) = ""
|
||||
|
||||
We'll just store the key with a blank value to indicate that a student is signed up for a particular class. For this application, we're going to think about a key-value pair's key as a :ref:`tuple <data-modeling-tuples>`. Encoding a tuple of data elements into a key is a very common pattern for an ordered key-value store.
|
||||
|
||||
We'll keep data about classes like this::
|
||||
|
||||
// ("class", class_name) = seatsAvailable
|
||||
|
||||
Similarly, each such key will represent an available class. We'll use ``seatsAvailable`` to record the number of seats available.
|
||||
|
||||
Transactions
|
||||
------------
|
||||
|
||||
We're going to rely on the powerful guarantees of transactions to help keep all of our modifications straight, so let's look at how the FoundationDB Java API lets you write a transactional function. We use the :meth:`run` method to execute a code block transactionally. Let's write the simple ``addClass`` function we'll use to populate the database's class list:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
private static void addClass(TransactionContext db, final String c) {
|
||||
db.run((Transaction tr) -> {
|
||||
tr.set(Tuple.from("class", c).pack(), encodeInt(100));
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
A function using this approach takes a :class:`TransactionContext` parameter. When *calling* such a function, you can pass either a :class:`Database` or :class:`Transaction`, each of which are subclasses of :class:`TransactionContext`. The function to be executed transactionally is parameterized by the :class:`Transaction` it will use to do reads and writes.
|
||||
|
||||
The :meth:`run` method *automatically creates a transaction and implements a retry loop* to ensure that the transaction eventually commits.
|
||||
|
||||
For a :class:`Database` ``db``::
|
||||
|
||||
addClass(db, "class1")
|
||||
|
||||
is equivalent to something like:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
Transaction t = db.createTransaction();
|
||||
while (true) {
|
||||
try {
|
||||
t.set(Tuple.from("class", "class1").pack(), encodeInt(100));
|
||||
t.commit().join();
break;
|
||||
} catch (RuntimeException e) {
|
||||
t = t.onError(e).join();
|
||||
}
|
||||
}
|
||||
|
||||
If instead you pass a :class:`Transaction` for the :class:`TransactionContext` parameter, the transaction will be used directly, and it is assumed that the caller implements appropriate retry logic for errors. This permits functions using this pattern to be composed into larger transactions.
|
||||
|
||||
Making some sample classes
|
||||
--------------------------
|
||||
|
||||
Let's make some sample classes and put them in the ``classNames`` variable. We'll make individual classes from combinations of class types, levels, and times:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
// Generate 1,620 classes like '9:00 chem for dummies'
|
||||
private static List<String> levels = Arrays.asList("intro", "for dummies",
|
||||
"remedial", "101", "201", "301", "mastery", "lab", "seminar");
|
||||
|
||||
private static List<String> types = Arrays.asList("chem", "bio", "cs",
|
||||
"geometry", "calc", "alg", "film", "music", "art", "dance");
|
||||
|
||||
private static List<String> times = Arrays.asList("2:00", "3:00", "4:00",
|
||||
"5:00", "6:00", "7:00", "8:00", "9:00", "10:00", "11:00", "12:00", "13:00",
|
||||
"14:00", "15:00", "16:00", "17:00", "18:00", "19:00");
|
||||
|
||||
private static List<String> classNames = initClassNames();
|
||||
|
||||
private static List<String> initClassNames() {
|
||||
List<String> classNames = new ArrayList<String>();
|
||||
for (String level: levels)
|
||||
for (String type: types)
|
||||
for (String time: times)
|
||||
classNames.add(time + " " + type + " " + level);
|
||||
return classNames;
|
||||
}
|
||||
|
||||
Initializing the database
|
||||
-------------------------
|
||||
We initialize the database with our class list:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
private static void init(Database db) {
|
||||
db.run((Transaction tr) -> {
|
||||
tr.clear(Tuple.from("attends").range());
|
||||
tr.clear(Tuple.from("class").range());
|
||||
for (String className: classNames)
|
||||
addClass(tr, className);
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
After :meth:`init` is run, the database will contain all of the sample classes we created above.
|
||||
|
||||
Listing available classes
|
||||
-------------------------
|
||||
|
||||
Before students can do anything else, they need to be able to retrieve a list of available classes from the database. Because FoundationDB sorts its data by key and therefore has efficient range-read capability, we can retrieve all of the classes in a single database call. We find this range of keys with :meth:`getRange`:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
private static List<String> availableClasses(TransactionContext db) {
|
||||
return db.run((Transaction tr) -> {
|
||||
List<String> classNames = new ArrayList<String>();
|
||||
for(KeyValue kv: tr.getRange(Tuple.from("class").range()))
|
||||
classNames.add(Tuple.fromBytes(kv.getKey()).getString(1));
|
||||
return classNames;
|
||||
});
|
||||
}
|
||||
|
||||
In general, the :meth:`Tuple.range` method returns a :class:`Range` representing all the key-value pairs starting with the specified tuple. In this case, we want all classes, so we call :meth:`Tuple.range` with the tuple ``("class")``. The :meth:`getRange` method returns an iterable of the key-values specified by its range. To extract the class name, we unpack the key using :meth:`Tuple.fromBytes` and take its second part. (The first part is the prefix ``"class"``.)
|
||||
|
||||
Signing up for a class
|
||||
----------------------
|
||||
|
||||
We finally get to the crucial function. A student has decided on a class (by name) and wants to sign up. The ``signup`` function will take a student (``s``) and a class (``c``):
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
private static void signup(TransactionContext db, final String s, final String c) {
|
||||
db.run((Transaction tr) -> {
|
||||
byte[] rec = Tuple.from("attends", s, c).pack();
|
||||
tr.set(rec, Tuple.from("").pack());
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
We simply insert the appropriate record (with a blank value).
|
||||
|
||||
Dropping a class
|
||||
----------------
|
||||
|
||||
Dropping a class is similar to signing up:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
private static void drop(TransactionContext db, final String s, final String c) {
|
||||
db.run((Transaction tr) -> {
|
||||
byte[] rec = Tuple.from("attends", s, c).pack();
|
||||
tr.clear(rec);
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
Of course, to actually drop the student from the class, we need to be able to delete a record from the database. We do this with the :meth:`clear` method.
|
||||
|
||||
Done?
|
||||
-----
|
||||
|
||||
We report back to the project leader that our application is done---students can sign up for, drop, and list classes. Unfortunately, we learn that there has been a bit of scope creep in the mean time. Popular classes are getting over-subscribed, and our application is going to need to enforce the class size constraint as students add and drop classes.
|
||||
|
||||
Seats are limited!
|
||||
------------------
|
||||
|
||||
Let's go back to the data model. Remember that we stored the number of seats in the class in the value of the key-value entry in the class list. Let's refine that a bit to track the *remaining* number of seats in the class. The initialization can work the same way (in our example, all classes initially have 100 seats), but the ``availableClasses``, ``signup``, and ``drop`` functions are going to have to change. Let's start with ``availableClasses``:
|
||||
|
||||
.. code-block:: java
|
||||
:emphasize-lines: 5
|
||||
|
||||
private static List<String> availableClasses(TransactionContext db) {
|
||||
return db.run((Transaction tr) -> {
|
||||
List<String> classNames = new ArrayList<String>();
|
||||
for(KeyValue kv: tr.getRange(Tuple.from("class").range())) {
|
||||
if (decodeInt(kv.getValue()) > 0)
|
||||
classNames.add(Tuple.fromBytes(kv.getKey()).getString(1));
|
||||
}
|
||||
return classNames;
|
||||
});
|
||||
}
|
||||
|
||||
This is easy -- we simply add a condition to check that the value is non-zero. Let's check out ``signup`` next:
|
||||
|
||||
.. code-block:: java
|
||||
:emphasize-lines: 4-11
|
||||
|
||||
private static void signup(TransactionContext db, final String s, final String c) {
|
||||
db.run((Transaction tr) -> {
|
||||
byte[] rec = Tuple.from("attends", s, c).pack();
|
||||
if (tr.get(rec).join() != null)
|
||||
return null; // already signed up
|
||||
|
||||
int seatsLeft = decodeInt(tr.get(Tuple.from("class", c).pack()).join());
|
||||
if (seatsLeft == 0)
|
||||
throw new IllegalStateException("No remaining seats");
|
||||
|
||||
tr.set(Tuple.from("class", c).pack(), encodeInt(seatsLeft - 1));
|
||||
tr.set(rec, Tuple.from("").pack());
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
We now have to check that we aren't already signed up, since we don't want a double sign up to decrease the number of seats twice. Then we look up how many seats are left to make sure there is a seat remaining so we don't push the counter into the negative. If there is a seat remaining, we decrement the counter.
|
||||
|
||||
|
||||
Concurrency and consistency
|
||||
---------------------------
|
||||
|
||||
The ``signup`` function is starting to get a bit complex; it now reads and writes a few different key-value pairs in the database. One of the tricky issues in this situation is what happens as multiple clients/students read and modify the database at the same time. Couldn't two students both see one remaining seat and sign up at the same time?
|
||||
|
||||
These are tricky issues without simple answers---unless you have transactions! Because these functions are defined as FoundationDB transactions, we can have a simple answer: Each transactional function behaves as if it is the only one modifying the database. There is no way for a transaction to 'see' another transaction change the database, and each transaction ensures that either all of its modifications occur or none of them do.
|
||||
|
||||
Looking deeper, it is, of course, possible for two transactions to conflict. For example, if two people both see a class with one seat and sign up at the same time, FoundationDB must allow only one to succeed. This causes one of the transactions to fail to commit (which can also be caused by network outages, crashes, etc.). To ensure correct operation, applications need to handle this situation, usually via retrying the transaction. In this case, the conflicting transaction will be retried automatically by the :meth:`run` method and will eventually lead to the correct result, a 'No remaining seats' exception.
|
||||
|
||||
Idempotence
|
||||
-----------
|
||||
|
||||
Occasionally, a transaction might be retried even after it succeeds (for example, if the client loses contact with the cluster at just the wrong moment). This can cause problems if transactions are not written to be idempotent, i.e. to have the same effect if committed twice as if committed once. There are generic design patterns for :ref:`making any transaction idempotent <developer-guide-unknown-results>`, but many transactions are naturally idempotent. For example, all of the transactions in this tutorial are idempotent.
|
||||
|
||||
Dropping with limited seats
|
||||
---------------------------
|
||||
|
||||
Let's finish up the limited seats feature by modifying the drop function:
|
||||
|
||||
.. code-block:: java
|
||||
:emphasize-lines: 4-7
|
||||
|
||||
private static void drop(TransactionContext db, final String s, final String c) {
|
||||
db.run((Transaction tr) -> {
|
||||
byte[] rec = Tuple.from("attends", s, c).pack();
|
||||
if (tr.get(rec).join() == null)
|
||||
return null; // not taking this class
|
||||
byte[] classKey = Tuple.from("class", c).pack();
|
||||
tr.set(classKey, encodeInt(decodeInt(tr.get(classKey).join()) + 1));
|
||||
tr.clear(rec);
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
This case is easier than signup because there are no constraints we can hit. We just need to make sure the student is in the class and to "give back" one seat when the student drops.
|
||||
|
||||
More features?!
|
||||
---------------
|
||||
|
||||
Of course, as soon as our new version of the system goes live, we hear of a trick that certain students are using. They are signing up for all classes immediately, and only later dropping those that they don't want to take. This has led to an unusable system, and we have been asked to fix it. We decide to limit students to five classes:
|
||||
|
||||
.. code-block:: java
|
||||
:emphasize-lines: 11-13
|
||||
|
||||
private static void signup(TransactionContext db, final String s, final String c) {
|
||||
db.run((Transaction tr) -> {
|
||||
byte[] rec = Tuple.from("attends", s, c).pack();
|
||||
if (tr.get(rec).join() != null)
|
||||
return null; // already signed up
|
||||
|
||||
int seatsLeft = decodeInt(tr.get(Tuple.from("class", c).pack()).join());
|
||||
if (seatsLeft == 0)
|
||||
throw new IllegalStateException("No remaining seats");
|
||||
|
||||
List<KeyValue> classes = tr.getRange(Tuple.from("attends", s).range()).asList().join();
|
||||
if (classes.size() == 5)
|
||||
throw new IllegalStateException("Too many classes");
|
||||
|
||||
tr.set(Tuple.from("class", c).pack(), encodeInt(seatsLeft - 1));
|
||||
tr.set(rec, Tuple.from("").pack());
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
Fortunately, we decided on a data model that keeps all of the attending records for a single student together. With this approach, we can use a single range read to retrieve all the classes that a student attends. We simply throw an exception if the number of classes has reached the limit of five.
|
||||
|
||||
Composing transactions
|
||||
----------------------
|
||||
|
||||
Oh, just one last feature, we're told. We have students that are trying to switch from one popular class to another. By the time they drop one class to free up a slot for themselves, the open slot in the other class is gone. By the time they see this and try to re-add their old class, that slot is gone too! So, can we make it so that a student can switch from one class to another without this worry?
|
||||
|
||||
Fortunately, we have FoundationDB, and this sounds an awful lot like the transactional property of atomicity---the all-or-nothing behavior that we already rely on. All we need to do is to *compose* the ``drop`` and ``signup`` functions into a new ``switchClasses`` function. This makes the ``switchClasses`` function exceptionally easy:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
private static void switchClasses(TransactionContext db, final String s, final String oldC, final String newC) {
|
||||
db.run((Transaction tr) -> {
|
||||
drop(tr, s, oldC);
|
||||
signup(tr, s, newC);
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
The simplicity of this implementation belies the sophistication of what FoundationDB is taking care of for us.
|
||||
|
||||
By dropping the old class and signing up for the new one inside a single transaction, we ensure that either both steps happen, or that neither happens. The first notable thing about the ``switchClasses`` function is that it is transactional, but it also calls the transactional functions ``signup`` and ``drop``. Because these transactional functions can accept either a database or an existing transaction as their ``db`` parameter, the ``switchClasses`` function can be called with a database by a simple client, and a new transaction will be automatically created. However, once this transaction is created and passed in as ``tr``, the calls to ``drop`` and ``signup`` both share the same ``tr``. This ensures that they see each other's modifications to the database, and all of the changes that both of them make in sequence are made transactionally when the ``switchClasses`` function returns. This compositional capability is very powerful.
|
||||
|
||||
Also note that, if an exception is raised, for example, in ``signup``, the exception is not caught by ``switchClasses`` and so will be thrown to the calling function. In this case, the transaction object (owned by the :meth:`run` method) is destroyed, automatically rolling back all database modifications, leaving the database completely unchanged by the half-executed function.
|
||||
|
||||
Are we done?
|
||||
------------
|
||||
|
||||
Yep, we're done. Fortunately, our UI team built an awesome UI while we were working on our back end, and we are ready to deploy. If you want to see this entire application in one place plus some multithreaded testing code to simulate concurrency, look at the :ref:`class-sched-java-appendix`, below.
|
||||
|
||||
Deploying and scaling
|
||||
---------------------
|
||||
|
||||
Since we store all state for this application in FoundationDB, deploying and scaling this solution up is impressively painless. Just run a web server, the UI, this back end, and point the whole thing at FoundationDB. We can run as many computers with this setup as we want, and they can all hit the database at the same time because of the transactional integrity of FoundationDB. Also, since all of the state in the system is stored in the database, any of these computers can fail without any lasting consequences.
|
||||
|
||||
Next steps
|
||||
==========
|
||||
|
||||
* See :doc:`data-modeling` for guidance on using tuple and subspaces to enable effective storage and retrieval of data.
|
||||
* See :doc:`developer-guide` for general guidance on development using FoundationDB.
|
||||
* See the :doc:`API References <api-reference>` for detailed API documentation.
|
||||
|
||||
.. _class-sched-java-appendix:
|
||||
|
||||
Appendix: ClassScheduling.java
|
||||
===============================
|
||||
|
||||
Here's the code for the scheduling tutorial:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import com.apple.foundationdb.*;
|
||||
import com.apple.foundationdb.tuple.Tuple;
|
||||
|
||||
|
||||
// Data model:
|
||||
// ("attends", student, class) = ""
|
||||
// ("class", class_name) = seatsLeft
|
||||
|
||||
public class ClassScheduling {
|
||||
|
||||
private static final FDB fdb;
|
||||
private static final Database db;
|
||||
|
||||
static {
|
||||
fdb = FDB.selectAPIVersion(510);
|
||||
db = fdb.open();
|
||||
}
|
||||
|
||||
// Generate 1,620 classes like '9:00 chem for dummies'
|
||||
private static List<String> levels = Arrays.asList("intro", "for dummies",
|
||||
"remedial", "101", "201", "301", "mastery", "lab", "seminar");
|
||||
|
||||
private static List<String> types = Arrays.asList("chem", "bio", "cs",
|
||||
"geometry", "calc", "alg", "film", "music", "art", "dance");
|
||||
|
||||
private static List<String> times = Arrays.asList("2:00", "3:00", "4:00",
|
||||
"5:00", "6:00", "7:00", "8:00", "9:00", "10:00", "11:00", "12:00", "13:00",
|
||||
"14:00", "15:00", "16:00", "17:00", "18:00", "19:00");
|
||||
|
||||
private static List<String> classNames = initClassNames();
|
||||
|
||||
private static List<String> initClassNames() {
|
||||
List<String> classNames = new ArrayList<String>();
|
||||
for (String level: levels)
|
||||
for (String type: types)
|
||||
for (String time: times)
|
||||
classNames.add(time + " " + type + " " + level);
|
||||
return classNames;
|
||||
}
|
||||
|
||||
private static void addClass(TransactionContext db, final String c) {
|
||||
db.run((Transaction tr) -> {
|
||||
tr.set(Tuple.from("class", c).pack(), encodeInt(100));
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
private static byte[] encodeInt(int value) {
|
||||
byte[] output = new byte[4];
|
||||
ByteBuffer.wrap(output).putInt(value);
|
||||
return output;
|
||||
}
|
||||
|
||||
private static int decodeInt(byte[] value) {
|
||||
if (value.length != 4)
|
||||
throw new IllegalArgumentException("Array must be of size 4");
|
||||
return ByteBuffer.wrap(value).getInt();
|
||||
}
|
||||
|
||||
private static void init(Database db) {
|
||||
db.run((Transaction tr) -> {
|
||||
tr.clear(Tuple.from("attends").range());
|
||||
tr.clear(Tuple.from("class").range());
|
||||
for (String className: classNames)
|
||||
addClass(tr, className);
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
private static List<String> availableClasses(TransactionContext db) {
|
||||
return db.run((Transaction tr) -> {
|
||||
List<String> classNames = new ArrayList<String>();
|
||||
for(KeyValue kv: tr.getRange(Tuple.from("class").range())) {
|
||||
if (decodeInt(kv.getValue()) > 0)
|
||||
classNames.add(Tuple.fromBytes(kv.getKey()).getString(1));
|
||||
}
|
||||
return classNames;
|
||||
});
|
||||
}
|
||||
|
||||
private static void drop(TransactionContext db, final String s, final String c) {
|
||||
db.run((Transaction tr) -> {
|
||||
byte[] rec = Tuple.from("attends", s, c).pack();
|
||||
if (tr.get(rec).join() == null)
|
||||
return null; // not taking this class
|
||||
byte[] classKey = Tuple.from("class", c).pack();
|
||||
tr.set(classKey, encodeInt(decodeInt(tr.get(classKey).join()) + 1));
|
||||
tr.clear(rec);
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
private static void signup(TransactionContext db, final String s, final String c) {
|
||||
db.run((Transaction tr) -> {
|
||||
byte[] rec = Tuple.from("attends", s, c).pack();
|
||||
if (tr.get(rec).join() != null)
|
||||
return null; // already signed up
|
||||
|
||||
int seatsLeft = decodeInt(tr.get(Tuple.from("class", c).pack()).join());
|
||||
if (seatsLeft == 0)
|
||||
throw new IllegalStateException("No remaining seats");
|
||||
|
||||
List<KeyValue> classes = tr.getRange(Tuple.from("attends", s).range()).asList().join();
|
||||
if (classes.size() == 5)
|
||||
throw new IllegalStateException("Too many classes");
|
||||
|
||||
tr.set(Tuple.from("class", c).pack(), encodeInt(seatsLeft - 1));
|
||||
tr.set(rec, Tuple.from("").pack());
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
private static void switchClasses(TransactionContext db, final String s, final String oldC, final String newC) {
|
||||
db.run((Transaction tr) -> {
|
||||
drop(tr, s, oldC);
|
||||
signup(tr, s, newC);
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
//
|
||||
// Testing
|
||||
//
|
||||
|
||||
private static void simulateStudents(int i, int ops) {
|
||||
|
||||
String studentID = "s" + Integer.toString(i);
|
||||
List<String> allClasses = classNames;
|
||||
List<String> myClasses = new ArrayList<String>();
|
||||
|
||||
String c;
|
||||
String oldC;
|
||||
String newC;
|
||||
Random rand = new Random();
|
||||
|
||||
for (int j=0; j<ops; j++) {
|
||||
int classCount = myClasses.size();
|
||||
List<String> moods = new ArrayList<String>();
|
||||
if (classCount > 0) {
|
||||
moods.add("drop");
|
||||
moods.add("switch");
|
||||
}
|
||||
if (classCount < 5)
|
||||
moods.add("add");
|
||||
String mood = moods.get(rand.nextInt(moods.size()));
|
||||
|
||||
try {
|
||||
if (allClasses.isEmpty())
|
||||
allClasses = availableClasses(db);
|
||||
if (mood.equals("add")) {
|
||||
c = allClasses.get(rand.nextInt(allClasses.size()));
|
||||
signup(db, studentID, c);
|
||||
myClasses.add(c);
|
||||
} else if (mood.equals("drop")) {
|
||||
c = myClasses.get(rand.nextInt(myClasses.size()));
|
||||
drop(db, studentID, c);
|
||||
myClasses.remove(c);
|
||||
} else if (mood.equals("switch")) {
|
||||
oldC = myClasses.get(rand.nextInt(myClasses.size()));
|
||||
newC = allClasses.get(rand.nextInt(allClasses.size()));
|
||||
switchClasses(db, studentID, oldC, newC);
|
||||
myClasses.remove(oldC);
|
||||
myClasses.add(newC);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.out.println(e.getMessage() + " Need to recheck available classes.");
|
||||
allClasses.clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void runSim(int students, final int ops_per_student) throws InterruptedException {
|
||||
List<Thread> threads = new ArrayList<Thread>(students);//Thread[students];
|
||||
for (int i = 0; i < students; i++) {
|
||||
final int j = i;
|
||||
threads.add(new Thread(() -> simulateStudents(j, ops_per_student)) );
|
||||
}
|
||||
for (Thread thread: threads)
|
||||
thread.start();
|
||||
for (Thread thread: threads)
|
||||
thread.join();
|
||||
System.out.format("Ran %d transactions%n", students * ops_per_student);
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws InterruptedException {
|
||||
init(db);
|
||||
System.out.println("Initialized");
|
||||
runSim(10,10);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,515 @@
|
|||
########################
|
||||
Class Scheduling in Ruby
|
||||
########################
|
||||
|
||||
This tutorial provides a walkthrough of designing and building a simple application in Ruby using FoundationDB. In this tutorial, we use a few simple data modeling techniques. For a more in-depth discussion of data modeling in FoundationDB, see :doc:`data-modeling`.
|
||||
|
||||
The concepts in this tutorial are applicable to all the :doc:`languages <api-reference>` supported by FoundationDB. If you prefer, you can see a version of this tutorial in :doc:`Python <class-scheduling>`, :doc:`Java <class-scheduling-java>`, or :doc:`Go <class-scheduling-go>`.
|
||||
|
||||
.. _class-sched-ruby-first-steps:
|
||||
|
||||
First steps
|
||||
===========
|
||||
|
||||
Let's begin with "Hello world."
|
||||
|
||||
If you have not yet installed FoundationDB, see :doc:`getting-started-mac` or :doc:`getting-started-linux`.
|
||||
|
||||
Open a Ruby interactive interpreter and import the FoundationDB API module::
|
||||
|
||||
$ irb
|
||||
> require 'fdb'
|
||||
=> true
|
||||
|
||||
Before using the API, we need to specify the API version. This allows programs to maintain compatibility even if the API is modified in future versions::
|
||||
|
||||
> FDB.api_version 510
|
||||
=> nil
|
||||
|
||||
Next, we open a FoundationDB database. The API will connect to the FoundationDB cluster indicated by the :ref:`default cluster file <default-cluster-file>`. ::
|
||||
|
||||
> @db = FDB.open
|
||||
=> #<FDB::Database:0x007fc2309751e0 @dpointer=#<FFI::Pointer address=0x007fc231c139c0>, @options=#<FDB::DatabaseOptions:0x007fc230975168 @setfunc=#<Proc:0x007fc230975190@/Users/stephenpimentel/.rvm/gems/ruby-2.0.0-p247/gems/fdb-1.0.0/lib/fdbimpl.rb:510 (lambda)>>>
|
||||
|
||||
We are ready to use the database. In Ruby, using the ``[]`` operator on the database object is a convenient syntax for performing a read or write on the database. First, let's simply write a key-value pair::
|
||||
|
||||
> @db['hello'] = 'world'
|
||||
=> "world"
|
||||
|
||||
When this command returns without exception, the modification is durably stored in FoundationDB! Under the covers, this function creates a transaction with a single modification. We'll see later how to do multiple operations in a single transaction. For now, let's read back the data::
|
||||
|
||||
> print 'hello ', @db['hello']
|
||||
hello world => nil
|
||||
|
||||
If this is all working, it looks like we are ready to start building a real application. For reference, here's the full code for "hello world":
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
require 'fdb'
|
||||
FDB.api_version 510
|
||||
@db = FDB.open
|
||||
@db['hello'] = 'world'
|
||||
print 'hello ', @db['hello']
|
||||
|
||||
Class scheduling application
|
||||
============================
|
||||
|
||||
Let's say we've been asked to build a class scheduling system for students and administrators. We'll walk through the design and implementation of this application. Instead of typing everything in as you follow along, look at the :ref:`class-sched-ruby-appendix` for a finished version of the program. You may want to refer to this code as we walk through the tutorial.
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
We'll need to let users list available classes and track which students have signed up for which classes. Here's a first cut at the functions we'll need to implement::
|
||||
|
||||
available_classes() # returns list of classes
|
||||
signup(studentID, class) # signs up a student for a class
|
||||
drop(studentID, class) # drops a student from a class
|
||||
|
||||
.. _class-sched-ruby-data-model:
|
||||
|
||||
Data model
|
||||
----------
|
||||
|
||||
First, we need to design a :doc:`data model <data-modeling>`. A data model is just a method for storing our application data using keys and values in FoundationDB. We seem to have two main types of data: (1) a list of classes and (2) a record of which students will attend which classes. Let's keep attending data like this::
|
||||
|
||||
# ['attends', student, class] = ''
|
||||
|
||||
We'll just store the key with a blank value to indicate that a student is signed up for a particular class. For this application, we're going to think about a key-value pair's key as a :ref:`tuple <data-modeling-tuples>`. Encoding a tuple of data elements into a key is a very common pattern for an ordered key-value store.
|
||||
|
||||
We'll keep data about classes like this::
|
||||
|
||||
# ['class', class_name] = seats_available
|
||||
|
||||
Similarly, each such key will represent an available class. We'll use ``seats_available`` to record the number of seats available.
|
||||
|
||||
Transactions
|
||||
------------
|
||||
|
||||
We're going to rely on the powerful guarantees of transactions to help keep all of our modifications straight, so let's look at a nice way that the FoundationDB Ruby API lets you write a transactional function. The :meth:`transact` method ensures that a code block is executed transactionally. Let's write the very simple ``add_class`` function we will use to populate the database's class list:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
def add_class(db_or_tr, c)
|
||||
db_or_tr.transact do |tr|
|
||||
tr[FDB::Tuple.pack(['class',c])] = '100'
|
||||
end
|
||||
end
|
||||
|
||||
A function using this approach has a parameter taking either a :class:`Database` or :class:`Transaction` on which it calls the :meth:`transact` method. The block passed to :meth:`transact` is parameterized by the transaction the function will use to do reads and writes.
|
||||
|
||||
When *calling* such a function, however, you can pass a :class:`Database` instead of a :class:`Transaction`. The method *automatically creates a transaction and implements a retry loop* to ensure that the transaction eventually commits.
|
||||
|
||||
For a FoundationDB database ``@db``:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
add_class(@db, 'class1')
|
||||
|
||||
is equivalent to something like:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
tr = @db.create_transaction
|
||||
committed = false
|
||||
while !committed
|
||||
begin
|
||||
tr[FDB::Tuple.pack(['class',c])] = '100'
|
||||
tr.commit.wait
|
||||
committed = true
|
||||
rescue FDB::Error => e
|
||||
tr.on_error(e).wait
|
||||
end
|
||||
end
|
||||
|
||||
If instead you pass a :class:`Transaction` for the ``db_or_tr`` parameter, the transaction will be used directly, and it is assumed that the caller implements appropriate retry logic for errors. This permits functions using this pattern to be composed into larger transactions.
|
||||
|
||||
Making some sample classes
|
||||
--------------------------
|
||||
|
||||
Let's make some sample classes and put them in the ``@class_names`` variable. We'll make individual classes from combinations of class types, levels, and times:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
# Generate 1,620 classes like '9:00 chem for dummies'
|
||||
levels = ['intro', 'for dummies', 'remedial', '101',
|
||||
'201', '301', 'mastery', 'lab', 'seminar']
|
||||
types = ['chem', 'bio', 'cs', 'geometry', 'calc',
|
||||
'alg', 'film', 'music', 'art', 'dance']
|
||||
times = Array(2...20).map {|h| h.to_s.encode('UTF-8') + ':00'}
|
||||
class_combos = times.product(types, levels)
|
||||
@class_names = class_combos.map {|combo| combo.join(' ')}
|
||||
|
||||
Initializing the database
|
||||
-------------------------
|
||||
We initialize the database with our class list:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
def init(db_or_tr)
|
||||
db_or_tr.transact do |tr|
|
||||
tr.clear_range_start_with(FDB::Tuple.pack(['attends']))
|
||||
tr.clear_range_start_with(FDB::Tuple.pack(['class']))
|
||||
@class_names.each do |class_name|
|
||||
add_class(tr, class_name)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
After :meth:`init` is run, the database will contain all of the sample classes we created above.
|
||||
|
||||
Listing available classes
|
||||
-------------------------
|
||||
|
||||
Before students can do anything else, they need to be able to retrieve a list of available classes from the database. Because FoundationDB sorts its data by key and therefore has efficient range-read capability, we can retrieve all of the classes in a single database call. We find this range of keys with :meth:`get_range`:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
def available_classes(db_or_tr)
|
||||
db_or_tr.transact do |tr|
|
||||
r = FDB::Tuple.range(['class'])
|
||||
tr.get_range(r[0], r[1]) {|kv| FDB::Tuple.unpack(kv.key)[1]}
|
||||
end
|
||||
end
|
||||
|
||||
In general, the :meth:`FDB::Tuple.range` method returns an Array of two elements representing the begin and end of the range of all the key-value pairs starting with the specified tuple. In this case, we want all classes, so we call :meth:`FDB::Tuple.range` with the tuple ``['class']``. :meth:`get_range` returns an enumerable of the key-values specified by its range. To extract the class name, we unpack the key into a tuple using :meth:`FDB::Tuple.unpack` and take its second part. (The first part is the prefix ``'class'``.)
|
||||
|
||||
Signing up for a class
|
||||
----------------------
|
||||
|
||||
We finally get to the crucial function. A student has decided on a class (by name) and wants to sign up. The ``signup`` function will take a student (``s``) and a class (``c``):
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
def signup(db_or_tr, s, c)
|
||||
db_or_tr.transact do |tr|
|
||||
rec = FDB::Tuple.pack(['attends', s, c])
|
||||
tr[rec] = ''
|
||||
end
|
||||
end
|
||||
|
||||
We simply insert the appropriate record (with a blank value).
|
||||
|
||||
Dropping a class
|
||||
----------------
|
||||
|
||||
Dropping a class is similar to signing up:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
def drop(db_or_tr, s, c)
|
||||
db_or_tr.transact do |tr|
|
||||
rec = FDB::Tuple.pack(['attends', s, c])
|
||||
tr.clear(rec)
|
||||
end
|
||||
end
|
||||
|
||||
Of course, to actually drop the student from the class, we need to be able to delete a record from the database. We do this with the :meth:`clear` method.
|
||||
|
||||
Done?
|
||||
-----
|
||||
|
||||
We report back to the project leader that our application is done---students can sign up for, drop, and list classes. Unfortunately, we learn that there has been a bit of scope creep in the meantime. Popular classes are getting over-subscribed, and our application is going to need to enforce the class size constraint as students add and drop classes.
|
||||
|
||||
Seats are limited!
|
||||
------------------
|
||||
|
||||
Let's go back to the data model. Remember that we stored the number of seats in the class in the value of the key-value entry in the class list. Let's refine that a bit to track the *remaining* number of seats in the class. The initialization can work the same way (in our example, all classes initially have 100 seats), but the ``available_classes``, ``signup``, and ``drop`` functions are going to have to change. Let's start with ``available_classes``:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
def available_classes(db_or_tr)
|
||||
db_or_tr.transact do |tr|
|
||||
r = FDB::Tuple.range(['class'])
|
||||
tr.get_range(r[0], r[1]) {|kv| FDB::Tuple.unpack(kv.key)[1] if kv.value.to_i > 0}
|
||||
end
|
||||
end
|
||||
|
||||
This is easy -- we simply add a condition to check that the value is non-zero. Let's check out ``signup`` next:
|
||||
|
||||
.. code-block:: ruby
|
||||
:emphasize-lines: 4-13
|
||||
|
||||
def signup(db_or_tr, s, c)
|
||||
db_or_tr.transact do |tr|
|
||||
rec = FDB::Tuple.pack(['attends', s, c])
|
||||
if not tr[rec].nil?
|
||||
return # already signed up
|
||||
end
|
||||
|
||||
seats_left = tr[FDB::Tuple.pack(['class', c])].to_i
|
||||
if seats_left == 0
|
||||
raise 'No remaining seats'
|
||||
end
|
||||
|
||||
tr[FDB::Tuple.pack(['class',c])] = (seats_left - 1).to_s.encode('UTF-8')
|
||||
tr[rec] = ''
|
||||
end
|
||||
end
|
||||
|
||||
We now have to check that we aren't already signed up, since we don't want a double sign up to decrease the number of seats twice. Then we look up how many seats are left to make sure there is a seat remaining so we don't push the counter into the negative. If there is a seat remaining, we decrement the counter.
|
||||
|
||||
|
||||
Concurrency and consistency
|
||||
---------------------------
|
||||
|
||||
The ``signup`` function is starting to get a bit complex; it now reads and writes a few different key-value pairs in the database. One of the tricky issues in this situation is what happens as multiple clients/students read and modify the database at the same time. Couldn't two students both see one remaining seat and sign up at the same time?
|
||||
|
||||
These are tricky issues without simple answers---unless you have transactions! Because these functions are defined as FoundationDB transactions, we can have a simple answer: Each transactional function behaves as if it is the only one modifying the database. There is no way for a transaction to 'see' another transaction change the database, and each transaction ensures that either all of its modifications occur or none of them do.
|
||||
|
||||
Looking deeper, it is, of course, possible for two transactions to conflict. For example, if two people both see a class with one seat and sign up at the same time, FoundationDB must allow only one to succeed. This causes one of the transactions to fail to commit (which can also be caused by network outages, crashes, etc.). To ensure correct operation, applications need to handle this situation, usually via retrying the transaction. In this case, the conflicting transaction will be retried automatically by the :meth:`transact` method and will eventually lead to the correct result, a 'No remaining seats' exception.
|
||||
|
||||
Idempotence
|
||||
-----------
|
||||
|
||||
Occasionally, a transaction might be retried even after it succeeds (for example, if the client loses contact with the cluster at just the wrong moment). This can cause problems if transactions are not written to be idempotent, i.e. to have the same effect if committed twice as if committed once. There are generic design patterns for :ref:`making any transaction idempotent <developer-guide-unknown-results>`, but many transactions are naturally idempotent. For example, all of the transactions in this tutorial are idempotent.
|
||||
|
||||
Dropping with limited seats
|
||||
---------------------------
|
||||
|
||||
Let's finish up the limited seats feature by modifying the drop function:
|
||||
|
||||
.. code-block:: ruby
|
||||
:emphasize-lines: 4-8
|
||||
|
||||
def drop(db_or_tr, s, c)
|
||||
db_or_tr.transact do |tr|
|
||||
rec = FDB::Tuple.pack(['attends', s, c])
|
||||
if tr[rec].nil?
|
||||
return # not taking this class
|
||||
end
|
||||
class_key = FDB::Tuple.pack(['class',c])
|
||||
tr[class_key] = (tr[class_key].to_i + 1).to_s.encode('UTF-8')
|
||||
tr.clear(rec)
|
||||
end
|
||||
end
|
||||
|
||||
This case is easier than signup because there are no constraints we can hit. We just need to make sure the student is in the class and to "give back" one seat when the student drops.
|
||||
|
||||
More features?!
|
||||
---------------
|
||||
|
||||
Of course, as soon as our new version of the system goes live, we hear of a trick that certain students are using. They are signing up for all classes immediately, and only later dropping those that they don't want to take. This has led to an unusable system, and we have been asked to fix it. We decide to limit students to five classes:
|
||||
|
||||
.. code-block:: ruby
|
||||
:emphasize-lines: 13-17
|
||||
|
||||
def signup(db_or_tr, s, c)
|
||||
db_or_tr.transact do |tr|
|
||||
rec = FDB::Tuple.pack(['attends', s, c])
|
||||
if not tr[rec].nil?
|
||||
return # already signed up
|
||||
end
|
||||
|
||||
seats_left = tr[FDB::Tuple.pack(['class', c])].to_i
|
||||
if seats_left == 0
|
||||
raise 'No remaining seats'
|
||||
end
|
||||
|
||||
r = FDB::Tuple.range(['attends', s])
|
||||
classes = tr.get_range(r[0], r[1])
|
||||
if classes.count == 5
|
||||
raise 'Too many classes'
|
||||
end
|
||||
|
||||
tr[FDB::Tuple.pack(['class',c])] = (seats_left - 1).to_s.encode('UTF-8')
|
||||
tr[rec] = ''
|
||||
end
|
||||
end
|
||||
|
||||
Fortunately, we decided on a data model that keeps all of the attending records for a single student together. With this approach, we can use a single range read to retrieve all the classes that a student attends. We simply throw an exception if the number of classes has reached the limit of five.
|
||||
|
||||
Composing transactions
|
||||
----------------------
|
||||
|
||||
Oh, just one last feature, we're told. We have students that are trying to switch from one popular class to another. By the time they drop one class to free up a slot for themselves, the open slot in the other class is gone. By the time they see this and try to re-add their old class, that slot is gone too! So, can we make it so that a student can switch from one class to another without this worry?
|
||||
|
||||
Fortunately, we have FoundationDB, and this sounds an awful lot like the transactional property of atomicity---the all-or-nothing behavior that we already rely on. All we need to do is to *compose* the ``drop`` and ``signup`` functions into a new ``switch`` function. This makes the ``switch`` function exceptionally easy:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
def switch(db_or_tr, s, old_c, new_c)
|
||||
db_or_tr.transact do |tr|
|
||||
drop(tr, s, old_c)
|
||||
signup(tr, s, new_c)
|
||||
end
|
||||
end
|
||||
|
||||
The simplicity of this implementation belies the sophistication of what FoundationDB is taking care of for us.
|
||||
|
||||
By dropping the old class and signing up for the new one inside a single transaction, we ensure that either both steps happen, or that neither happens. The first notable thing about the ``switch`` function is that it is transactional, but it also calls the transactional functions ``signup`` and ``drop``. Because these transactional functions can accept either a database or an existing transaction as the ``tr`` argument, the ``switch`` function can be called with a database by a simple client, and a new transaction will be automatically created. However, once this transaction is created and passed in as ``tr``, the calls to ``drop`` and ``signup`` both share the same ``tr``. This ensures that they see each other's modifications to the database, and all of the changes that both of them make in sequence are made transactionally when the ``switch`` function returns. This compositional capability is very powerful.
|
||||
|
||||
Also note that, if an exception is raised, for example, in ``signup``, the exception is not caught by ``switch`` and so will be thrown to the calling function. In this case, the transaction object (owned by the :meth:`transact` method) is destroyed, automatically rolling back all database modifications, leaving the database completely unchanged by the half-executed function.
|
||||
|
||||
Are we done?
|
||||
------------
|
||||
|
||||
Yep, we're done. Fortunately, our UI team built an awesome UI while we were working on our back end, and we are ready to deploy. If you want to see this entire application in one place plus some multithreaded testing code to simulate concurrency, look at the :ref:`class-sched-ruby-appendix`, below.
|
||||
|
||||
Deploying and scaling
|
||||
---------------------
|
||||
|
||||
Since we store all state for this application in FoundationDB, deploying and scaling this solution up is impressively painless. Just run a web server, the UI, this back end, and point the whole thing at FoundationDB. We can run as many computers with this setup as we want, and they can all hit the database at the same time because of the transactional integrity of FoundationDB. Also, since all of the state in the system is stored in the database, any of these computers can fail without any lasting consequences.
|
||||
|
||||
Next steps
|
||||
==========
|
||||
|
||||
* See :doc:`data-modeling` for guidance on using tuple and subspaces to enable effective storage and retrieval of data.
|
||||
* See :doc:`developer-guide` for general guidance on development using FoundationDB.
|
||||
* See the :doc:`API References <api-reference>` for detailed API documentation.
|
||||
|
||||
.. _class-sched-ruby-appendix:
|
||||
|
||||
Appendix: class_scheduling.rb
|
||||
===============================
|
||||
|
||||
Here's the code for the scheduling tutorial:
|
||||
|
||||
.. code-block:: ruby
|
||||
|
||||
require 'fdb'
|
||||
|
||||
FDB.api_version 510
|
||||
|
||||
####################################
|
||||
## Initialization ##
|
||||
####################################
|
||||
|
||||
# Data model:
|
||||
# ['attends', student, class] = ''
|
||||
# ['class', class_name] = seats_left
|
||||
|
||||
@db = FDB.open
|
||||
|
||||
def add_class(db_or_tr, c)
|
||||
db_or_tr.transact do |tr|
|
||||
tr[FDB::Tuple.pack(['class',c])] = '100'
|
||||
end
|
||||
end
|
||||
|
||||
# Generate 1,620 classes like '9:00 chem for dummies'
|
||||
levels = ['intro', 'for dummies', 'remedial', '101',
|
||||
'201', '301', 'mastery', 'lab', 'seminar']
|
||||
types = ['chem', 'bio', 'cs', 'geometry', 'calc',
|
||||
'alg', 'film', 'music', 'art', 'dance']
|
||||
times = Array(2...20).map {|h| h.to_s.encode('UTF-8') + ':00'}
|
||||
class_combos = times.product(types, levels)
|
||||
@class_names = class_combos.map {|combo| combo.join(' ')}
|
||||
|
||||
def init(db_or_tr)
|
||||
db_or_tr.transact do |tr|
|
||||
tr.clear_range_start_with(FDB::Tuple.pack(['attends']))
|
||||
tr.clear_range_start_with(FDB::Tuple.pack(['class']))
|
||||
@class_names.each do |class_name|
|
||||
add_class(tr, class_name)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def available_classes(db_or_tr)
|
||||
db_or_tr.transact do |tr|
|
||||
r = FDB::Tuple.range(['class'])
|
||||
tr.get_range(r[0], r[1]) {|kv| FDB::Tuple.unpack(kv.key)[1] if kv.value.to_i > 0}
|
||||
end
|
||||
end
|
||||
|
||||
def signup(db_or_tr, s, c)
|
||||
db_or_tr.transact do |tr|
|
||||
rec = FDB::Tuple.pack(['attends', s, c])
|
||||
if not tr[rec].nil?
|
||||
return # already signed up
|
||||
end
|
||||
|
||||
seats_left = tr[FDB::Tuple.pack(['class', c])].to_i
|
||||
if seats_left == 0
|
||||
raise 'No remaining seats'
|
||||
end
|
||||
|
||||
r = FDB::Tuple.range(['attends', s])
|
||||
classes = tr.get_range(r[0], r[1])
|
||||
if classes.count == 5
|
||||
raise 'Too many classes'
|
||||
end
|
||||
|
||||
tr[FDB::Tuple.pack(['class',c])] = (seats_left - 1).to_s.encode('UTF-8')
|
||||
tr[rec] = ''
|
||||
end
|
||||
end
|
||||
|
||||
def drop(db_or_tr, s, c)
|
||||
db_or_tr.transact do |tr|
|
||||
rec = FDB::Tuple.pack(['attends', s, c])
|
||||
if tr[rec].nil?
|
||||
return # not taking this class
|
||||
end
|
||||
class_key = FDB::Tuple.pack(['class',c])
|
||||
tr[class_key] = (tr[class_key].to_i + 1).to_s.encode('UTF-8')
|
||||
tr.clear(rec)
|
||||
end
|
||||
end
|
||||
|
||||
def switch(db_or_tr, s, old_c, new_c)
|
||||
db_or_tr.transact do |tr|
|
||||
drop(tr, s, old_c)
|
||||
signup(tr, s, new_c)
|
||||
end
|
||||
end
|
||||
|
||||
####################################
|
||||
## Testing ##
|
||||
####################################
|
||||
|
||||
def indecisive_student(i, ops)
|
||||
student_ID = "s%d" % i
|
||||
all_classes = @class_names
|
||||
my_classes = []
|
||||
|
||||
Array(0...ops).each do |i|
|
||||
class_count = my_classes.length
|
||||
moods = []
|
||||
if class_count > 0
|
||||
moods.push('drop','switch')
|
||||
end
|
||||
if class_count < 5
|
||||
moods.push('add')
|
||||
end
|
||||
mood = moods.sample
|
||||
|
||||
begin
|
||||
if all_classes.empty?
|
||||
all_classes = available_classes(@db)
|
||||
end
|
||||
if mood == 'add'
|
||||
c = all_classes.sample
|
||||
signup(@db, student_ID, c)
|
||||
my_classes.push(c)
|
||||
elsif mood == 'drop'
|
||||
c = my_classes.sample
|
||||
drop(@db, student_ID, c)
|
||||
my_classes.delete(c)
|
||||
elsif mood == 'switch'
|
||||
old_c = my_classes.sample
|
||||
new_c = all_classes.sample
|
||||
switch(@db, student_ID, old_c, new_c)
|
||||
my_classes.delete(old_c)
|
||||
my_classes.push(new_c)
|
||||
end
|
||||
rescue => e
|
||||
print e, "Need to recheck available classes."
|
||||
all_classes = []
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def run(students, ops_per_student)
|
||||
threads = Array(0...students).map {|i| Thread.new(i, ops_per_student) {
|
||||
indecisive_student(i, ops_per_student)}
|
||||
}
|
||||
threads.each {|thr| thr.join}
|
||||
print "Ran %d transactions" % (students * ops_per_student)
|
||||
end
|
||||
|
||||
if __FILE__ == $0
|
||||
init(@db)
|
||||
print "initialized"
|
||||
run(10, 10)
|
||||
end
|
|
@ -0,0 +1,463 @@
|
|||
################
|
||||
Class Scheduling
|
||||
################
|
||||
|
||||
This tutorial provides a walkthrough of designing and building a simple application in Python using FoundationDB. In this tutorial, we use a few simple data modeling techniques. For a more in-depth discussion of data modeling in FoundationDB, see :doc:`data-modeling`.
|
||||
|
||||
The concepts in this tutorial are applicable to all the :doc:`languages <api-reference>` supported by FoundationDB. If you prefer, you can see a version of this tutorial in:
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:titlesonly:
|
||||
|
||||
Ruby <class-scheduling-ruby>
|
||||
Java <class-scheduling-java>
|
||||
Go <class-scheduling-go>
|
||||
|
||||
.. _tutorial-first-steps:
|
||||
|
||||
First steps
|
||||
===========
|
||||
|
||||
Let's begin with "Hello world."
|
||||
|
||||
If you have not yet installed FoundationDB, see :doc:`getting-started-mac` or :doc:`getting-started-linux`.
|
||||
|
||||
Open a Python interactive interpreter and import the FoundationDB API module::
|
||||
|
||||
$ python
|
||||
>>> import fdb
|
||||
|
||||
Before using the API, we need to specify the API version. This allows programs to maintain compatibility even if the API is modified in future versions::
|
||||
|
||||
>>> fdb.api_version(510)
|
||||
|
||||
Next, we open a FoundationDB database. The API will connect to the FoundationDB cluster indicated by the :ref:`default cluster file <default-cluster-file>`. ::
|
||||
|
||||
>>> db = fdb.open()
|
||||
|
||||
We are ready to use the database. In Python, using the ``[]`` operator on the db object is a convenient syntax for performing a read or write on the database. First, let's simply write a key-value pair::
|
||||
|
||||
>>> db['hello'] = 'world'
|
||||
|
||||
When this command returns without exception, the modification is durably stored in FoundationDB! Under the covers, this function creates a transaction with a single modification. We'll see later how to do multiple operations in a single transaction. For now, let's read back the data::
|
||||
|
||||
>>> print 'hello', db['hello']
|
||||
hello world
|
||||
|
||||
If this is all working, it looks like we are ready to start building a real application. For reference, here's the full code for "hello world"::
|
||||
|
||||
import fdb
|
||||
fdb.api_version(510)
|
||||
db = fdb.open()
|
||||
db['hello'] = 'world'
|
||||
print 'hello', db['hello']
|
||||
|
||||
Class scheduling application
|
||||
============================
|
||||
|
||||
Let's say we've been asked to build a class scheduling system for students and administrators. We'll walk through the design and implementation of this application. Instead of typing everything in as you follow along, look at the :ref:`tutorial-appendix` for a finished version of the program. You may want to refer to this code as we walk through the tutorial.
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
We'll need to let users list available classes and track which students have signed up for which classes. Here's a first cut at the functions we'll need to implement::
|
||||
|
||||
available_classes() # returns list of classes
|
||||
signup(studentID, class) # signs up a student for a class
|
||||
drop(studentID, class) # drops a student from a class
|
||||
|
||||
.. _tutorial-data-model:
|
||||
|
||||
Data model
|
||||
----------
|
||||
|
||||
First, we need to design a :doc:`data model <data-modeling>`. A data model is just a method for storing our application data using keys and values in FoundationDB. We seem to have two main types of data: (1) a list of classes and (2) a record of which students will attend which classes. Let's keep attending data like this::
|
||||
|
||||
# ('attends', student, class) = ''
|
||||
|
||||
We'll just store the key with a blank value to indicate that a student is signed up for a particular class. For this application, we're going to think about a key-value pair's key as a :ref:`tuple <data-modeling-tuples>`. Encoding a tuple of data elements into a key is a very common pattern for an ordered key-value store.
|
||||
|
||||
We'll keep data about classes like this::
|
||||
|
||||
# ('class', class_name) = seats_available
|
||||
|
||||
Similarly, each such key will represent an available class. We'll use ``seats_available`` to record the number of seats available.
|
||||
|
||||
Directories and Subspaces
|
||||
-------------------------
|
||||
|
||||
FoundationDB includes a few tools that make it easy to model data using this approach. Let's begin by
|
||||
opening a :ref:`directory <developer-guide-directories>` in the database::
|
||||
|
||||
import fdb
|
||||
fdb.api_version(510)
|
||||
|
||||
db = fdb.open()
|
||||
scheduling = fdb.directory.create_or_open(db, ('scheduling',))
|
||||
|
||||
The :meth:`create_or_open` method returns a :ref:`subspace <developer-guide-sub-keyspaces>` where we'll store our application data. Each subspace has a fixed prefix it uses when defining keys. The prefix corresponds to the first element of a tuple. We decided that we wanted ``'attends'`` and ``'class'`` as our prefixes, so we'll create new subspaces for them within the ``scheduling`` subspace::
|
||||
|
||||
course = scheduling['class']
|
||||
attends = scheduling['attends']
|
||||
|
||||
Subspaces have a :meth:`pack` method for defining keys. To store the records for our data model, we can use ``attends.pack((s, c))`` and ``course.pack((c,))``.
|
||||
|
||||
Transactions
|
||||
------------
|
||||
|
||||
We're going to rely on the powerful guarantees of transactions to help keep all of our modifications straight, so let's look at a nice way that the FoundationDB Python API lets you write a transactional function. By using a decorator, an entire function is wrapped in a transaction. Let's write the very simple ``add_class`` function we will use to populate the database's class list::
|
||||
|
||||
@fdb.transactional
|
||||
def add_class(tr, c):
|
||||
tr[course.pack((c,))] = bytes(100)
|
||||
|
||||
:py:func:`@fdb.transactional <fdb.transactional>` is a Python decorator that makes a normal function a transactional function. All functions decorated this way *need to have a parameter named* ``tr``. This parameter is passed the transaction that the function should use to do reads and writes.
|
||||
|
||||
When *calling* a transactionally decorated function, however, you can pass a database instead of a transaction for the ``tr`` parameter. The decorator *automatically creates a transaction and implements a retry loop* to ensure that the transaction eventually commits.
|
||||
|
||||
For a FoundationDB database ``db``::
|
||||
|
||||
add_class(db, 'class1')
|
||||
|
||||
is equivalent to something like::
|
||||
|
||||
tr = db.create_transaction()
|
||||
while True:
|
||||
try:
|
||||
add_class(tr, 'class1')
|
||||
tr.commit().wait()
|
||||
break
|
||||
except fdb.FDBError as e:
|
||||
tr.on_error(e).wait()
|
||||
|
||||
If instead you pass a :class:`Transaction` for the ``tr`` parameter, the transaction will be used directly, and it is assumed that the caller implements appropriate retry logic for errors. This permits transactionally decorated functions to be composed into larger transactions.
|
||||
|
||||
Making some sample classes
|
||||
--------------------------
|
||||
|
||||
Let's make some sample classes and put them in the ``class_names`` variable. The Python ``itertools`` module is used to make individual classes from combinations of class types, levels, and times::
|
||||
|
||||
import itertools
|
||||
|
||||
# Generate 1,620 classes like '9:00 chem for dummies'
|
||||
levels = ['intro', 'for dummies', 'remedial', '101',
|
||||
'201', '301', 'mastery', 'lab', 'seminar']
|
||||
types = ['chem', 'bio', 'cs', 'geometry', 'calc',
|
||||
'alg', 'film', 'music', 'art', 'dance']
|
||||
times = [str(h) + ':00' for h in range(2, 20)]
|
||||
class_combos = itertools.product(times, types, levels)
|
||||
class_names = [' '.join(tup) for tup in class_combos]
|
||||
|
||||
Initializing the database
|
||||
-------------------------
|
||||
We initialize the database with our class list::
|
||||
|
||||
@fdb.transactional
|
||||
def init(tr):
|
||||
del tr[scheduling.range(())] # Clear the directory
|
||||
for class_name in class_names:
|
||||
add_class(tr, class_name)
|
||||
|
||||
After :func:`init` is run, the database will contain all of the sample classes we created above.
|
||||
|
||||
Listing available classes
|
||||
-------------------------
|
||||
|
||||
Before students can do anything else, they need to be able to retrieve a list of available classes from the database. Because FoundationDB sorts its data by key and therefore has efficient range-read capability, we can retrieve all of the classes in a single database call. We find this range of keys with :meth:`course.range`::
|
||||
|
||||
@fdb.transactional
|
||||
def available_classes(tr):
|
||||
return [course.unpack(k)[0] for k, v in tr[course.range(())]]
|
||||
|
||||
In general, the :meth:`Subspace.range` method returns a Python ``slice`` representing all the key-value pairs starting with the specified tuple. In this case, we want all classes, so we call :meth:`course.range` with the empty tuple ``()``. FoundationDB's ``tr[slice]`` function returns an iterable list of key-values in the range specified by the slice. We unpack the key ``k`` and value ``v`` in a comprehension. To extract the class name itself, we unpack the key into a tuple using the ``course`` subspace's :meth:`unpack` method and take its first part. (The subspace automatically strips the prefixes for the ``scheduling`` and ``course`` subspaces.)
|
||||
|
||||
Signing up for a class
|
||||
----------------------
|
||||
|
||||
We finally get to the crucial function. A student has decided on a class (by name) and wants to sign up. The ``signup`` function will take a student (``s``) and a class (``c``)::
|
||||
|
||||
@fdb.transactional
|
||||
def signup(tr, s, c):
|
||||
rec = attends.pack((s, c))
|
||||
tr[rec] = ''
|
||||
|
||||
We simply insert the appropriate record (with a blank value).
|
||||
|
||||
Dropping a class
|
||||
----------------
|
||||
|
||||
Dropping a class is similar to signing up::
|
||||
|
||||
@fdb.transactional
|
||||
def drop(tr, s, c):
|
||||
rec = attends.pack((s, c))
|
||||
del tr[rec]
|
||||
|
||||
Of course, to actually drop the student from the class, we need to be able to delete a record from the database. We do this with the ``del tr[key]`` syntax.
|
||||
|
||||
Done?
|
||||
-----
|
||||
|
||||
We report back to the project leader that our application is done---students can sign up for, drop, and list classes. Unfortunately, we learn that there has been a bit of scope creep in the meantime. Popular classes are getting over-subscribed, and our application is going to need to enforce the class size constraint as students add and drop classes.
|
||||
|
||||
Seats are limited!
|
||||
------------------
|
||||
|
||||
Let's go back to the data model. Remember that we stored the number of seats in the class in the value of the key-value entry in the class list. Let's refine that a bit to track the *remaining* number of seats in the class. The initialization can work the same way (in our example, all classes initially have 100 seats), but the ``available_classes``, ``signup``, and ``drop`` functions are going to have to change. Let's start with ``available_classes``:
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 4
|
||||
|
||||
@fdb.transactional
|
||||
def available_classes(tr):
|
||||
return [fdb.tuple.unpack(k)[2] for k, v in tr[course.range(())]
|
||||
if int(v)]
|
||||
|
||||
This is easy -- we simply add a condition to check that the value is non-zero. Let's check out ``signup`` next:
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 4,5,6,7,8,9
|
||||
|
||||
@fdb.transactional
|
||||
def signup(tr, s, c):
|
||||
rec = attends.pack((s, c))
|
||||
if tr[rec].present(): return # already signed up
|
||||
|
||||
seats_left = int(tr[course.pack((c,))])
|
||||
if not seats_left: raise Exception('No remaining seats')
|
||||
|
||||
tr[course.pack((c,))] = bytes(seats_left - 1)
|
||||
tr[rec] = ''
|
||||
|
||||
We now have to check that we aren't already signed up, since we don't want a double sign up to decrease the number of seats twice. Then we look up how many seats are left to make sure there is a seat remaining so we don't push the counter into the negative. If there is a seat remaining, we decrement the counter.
|
||||
|
||||
|
||||
Concurrency and consistency
|
||||
---------------------------
|
||||
|
||||
The ``signup`` function is starting to get a bit complex; it now reads and writes a few different key-value pairs in the database. One of the tricky issues in this situation is what happens as multiple clients/students read and modify the database at the same time. Couldn't two students both see one remaining seat and sign up at the same time?
|
||||
|
||||
These are tricky issues without simple answers---unless you have transactions! Because these functions are defined as FoundationDB transactions, we can have a simple answer: Each transactional function behaves as if it is the only one modifying the database. There is no way for a transaction to 'see' another transaction change the database, and each transaction ensures that either all of its modifications occur or none of them do.
|
||||
|
||||
Looking deeper, it is, of course, possible for two transactions to conflict. For example, if two people both see a class with one seat and sign up at the same time, FoundationDB must allow only one to succeed. This causes one of the transactions to fail to commit (which can also be caused by network outages, crashes, etc.). To ensure correct operation, applications need to handle this situation, usually via retrying the transaction. In this case, the conflicting transaction will be retried automatically by the ``@fdb.transactional`` decorator and will eventually lead to the correct result, a 'No remaining seats' exception.
|
||||
|
||||
Idempotence
|
||||
-----------
|
||||
|
||||
Occasionally, a transaction might be retried even after it succeeds (for example, if the client loses contact with the cluster at just the wrong moment). This can cause problems if transactions are not written to be idempotent, i.e. to have the same effect if committed twice as if committed once. There are generic design patterns for :ref:`making any transaction idempotent <developer-guide-unknown-results>`, but many transactions are naturally idempotent. For example, all of the transactions in this tutorial are idempotent.
|
||||
|
||||
Dropping with limited seats
|
||||
---------------------------
|
||||
|
||||
Let's finish up the limited seats feature by modifying the drop function:
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 4,5
|
||||
|
||||
@fdb.transactional
|
||||
def drop(tr, s, c):
|
||||
rec = attends.pack((s, c))
|
||||
if not tr[rec].present(): return # not taking this class
|
||||
tr[course.pack((c,))] = bytes(int(tr[course.pack((c,))]) + 1)
|
||||
del tr[rec]
|
||||
|
||||
This case is easier than signup because there are no constraints we can hit. We just need to make sure the student is in the class and to "give back" one seat when the student drops.
|
||||
|
||||
More features?!
|
||||
---------------
|
||||
|
||||
Of course, as soon as our new version of the system goes live, we hear of a trick that certain students are using. They are signing up for all classes immediately, and only later dropping those that they don't want to take. This has led to an unusable system, and we have been asked to fix it. We decide to limit students to five classes:
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 9,10
|
||||
|
||||
@fdb.transactional
|
||||
def signup(tr, s, c):
|
||||
rec = attends.pack((s, c))
|
||||
if tr[rec].present(): return # already signed up
|
||||
|
||||
seats_left = int(tr[course.pack((c,))])
|
||||
if not seats_left: raise Exception('No remaining seats')
|
||||
|
||||
classes = tr[attends.range((s,))]
|
||||
if len(list(classes)) == 5: raise Exception('Too many classes')
|
||||
|
||||
tr[course.pack((c,))] = bytes(seats_left - 1)
|
||||
tr[rec] = ''
|
||||
|
||||
Fortunately, we decided on a data model that keeps all of the attending records for a single student together. With this approach, we can use a single range read in the ``attends`` subspace to retrieve all the classes that a student is signed up for. We simply throw an exception if the number of classes has reached the limit of five.
|
||||
|
||||
Composing transactions
|
||||
----------------------
|
||||
|
||||
Oh, just one last feature, we're told. We have students that are trying to switch from one popular class to another. By the time they drop one class to free up a slot for themselves, the open slot in the other class is gone. By the time they see this and try to re-add their old class, that slot is gone too! So, can we make it so that a student can switch from one class to another without this worry?
|
||||
|
||||
Fortunately, we have FoundationDB, and this sounds an awful lot like the transactional property of atomicity---the all-or-nothing behavior that we already rely on. All we need to do is to *compose* the ``drop`` and ``signup`` functions into a new ``switch`` function. This makes the ``switch`` function exceptionally easy::
|
||||
|
||||
@fdb.transactional
|
||||
def switch(tr, s, old_c, new_c):
|
||||
drop(tr, s, old_c)
|
||||
signup(tr, s, new_c)
|
||||
|
||||
The simplicity of this implementation belies the sophistication of what FoundationDB is taking care of for us.
|
||||
|
||||
By dropping the old class and signing up for the new one inside a single transaction, we ensure that either both steps happen, or that neither happens. The first notable thing about the ``switch`` function is that it is transactionally decorated, but it also calls the transactionally decorated functions ``signup`` and ``drop``. Because these decorated functions can accept either a database or an existing transaction as the ``tr`` argument, the switch function can be called with a database by a simple client, and a new transaction will be automatically created. However, once this transaction is created and passed in as ``tr``, the calls to ``drop`` and ``signup`` both share the same ``tr``. This ensures that they see each other's modifications to the database, and all of the changes that both of them make in sequence are made transactionally when the switch function returns. This compositional capability is very powerful.
|
||||
|
||||
Also note that, if an exception is raised, for example, in ``signup``, the exception is not caught by ``switch`` and so will be thrown to the calling function. In this case, the transaction object (owned by the decorator) is destroyed, automatically rolling back all database modifications, leaving the database completely unchanged by the half-executed function.
|
||||
|
||||
Are we done?
|
||||
------------
|
||||
|
||||
Yep, we're done. Fortunately, our UI team built an awesome UI while we were working on our back end, and we are ready to deploy. If you want to see this entire application in one place plus some multithreaded testing code to simulate concurrency, look at the :ref:`tutorial-appendix`, below.
|
||||
|
||||
Deploying and scaling
|
||||
---------------------
|
||||
|
||||
Since we store all state for this application in FoundationDB, deploying and scaling this solution up is impressively painless. Just run a web server, the UI, this back end, and point the whole thing at FoundationDB. We can run as many computers with this setup as we want, and they can all hit the database at the same time because of the transactional integrity of FoundationDB. Also, since all of the state in the system is stored in the database, any of these computers can fail without any lasting consequences.
|
||||
|
||||
Next steps
|
||||
==========
|
||||
|
||||
* See :doc:`data-modeling` for guidance on using tuple and subspaces to enable effective storage and retrieval of data.
|
||||
* See :doc:`developer-guide` for general guidance on development using FoundationDB.
|
||||
* See the :doc:`API References <api-reference>` for detailed API documentation.
|
||||
|
||||
.. _tutorial-appendix:
|
||||
|
||||
Appendix: SchedulingTutorial.py
|
||||
===============================
|
||||
|
||||
Here's the code for the scheduling tutorial::
|
||||
|
||||
import itertools
|
||||
|
||||
import fdb
|
||||
|
||||
fdb.api_version(510)
|
||||
|
||||
|
||||
####################################
|
||||
## Initialization ##
|
||||
####################################
|
||||
|
||||
# Data model:
|
||||
# ('attends', student, class) = ''
|
||||
# ('class', class_name) = seats_left
|
||||
|
||||
db = fdb.open()
|
||||
scheduling = fdb.directory.create_or_open(db, ('scheduling',))
|
||||
course = scheduling['class']
|
||||
attends = scheduling['attends']
|
||||
|
||||
@fdb.transactional
|
||||
def add_class(tr, c):
|
||||
tr[course.pack((c,))] = bytes(100)
|
||||
|
||||
# Generate 1,620 classes like '9:00 chem for dummies'
|
||||
levels = ['intro', 'for dummies', 'remedial', '101',
|
||||
'201', '301', 'mastery', 'lab', 'seminar']
|
||||
types = ['chem', 'bio', 'cs', 'geometry', 'calc',
|
||||
'alg', 'film', 'music', 'art', 'dance']
|
||||
times = [str(h) + ':00' for h in range(2, 20)]
|
||||
class_combos = itertools.product(times, types, levels)
|
||||
class_names = [' '.join(tup) for tup in class_combos]
|
||||
|
||||
@fdb.transactional
|
||||
def init(tr):
|
||||
del tr[scheduling.range(())] # Clear the directory
|
||||
for class_name in class_names:
|
||||
add_class(tr, class_name)
|
||||
|
||||
|
||||
####################################
|
||||
## Class Scheduling Functions ##
|
||||
####################################
|
||||
|
||||
|
||||
@fdb.transactional
|
||||
def available_classes(tr):
|
||||
return [course.unpack(k)[0] for k, v in tr[course.range(())]
|
||||
if int(v)]
|
||||
|
||||
|
||||
@fdb.transactional
|
||||
def signup(tr, s, c):
|
||||
rec = attends.pack((s, c))
|
||||
if tr[rec].present(): return # already signed up
|
||||
|
||||
seats_left = int(tr[course.pack((c,))])
|
||||
if not seats_left: raise Exception('No remaining seats')
|
||||
|
||||
classes = tr[attends.range((s,))]
|
||||
if len(list(classes)) == 5: raise Exception('Too many classes')
|
||||
|
||||
tr[course.pack((c,))] = bytes(seats_left - 1)
|
||||
tr[rec] = ''
|
||||
|
||||
|
||||
@fdb.transactional
|
||||
def drop(tr, s, c):
|
||||
rec = attends.pack((s, c))
|
||||
if not tr[rec].present(): return # not taking this class
|
||||
tr[course.pack((c,))] = bytes(int(tr[course.pack((c,))]) + 1)
|
||||
del tr[rec]
|
||||
|
||||
|
||||
@fdb.transactional
|
||||
def switch(tr, s, old_c, new_c):
|
||||
drop(tr, s, old_c)
|
||||
signup(tr, s, new_c)
|
||||
|
||||
####################################
|
||||
## Testing ##
|
||||
####################################
|
||||
|
||||
import random
|
||||
import threading
|
||||
|
||||
def indecisive_student(i, ops):
|
||||
student_ID = 's{:d}'.format(i)
|
||||
all_classes = class_names
|
||||
my_classes = []
|
||||
|
||||
for i in range(ops):
|
||||
class_count = len(my_classes)
|
||||
moods = []
|
||||
if class_count: moods.extend(['drop', 'switch'])
|
||||
if class_count < 5: moods.append('add')
|
||||
mood = random.choice(moods)
|
||||
|
||||
try:
|
||||
if not all_classes:
|
||||
all_classes = available_classes(db)
|
||||
if mood == 'add':
|
||||
c = random.choice(all_classes)
|
||||
signup(db, student_ID, c)
|
||||
my_classes.append(c)
|
||||
elif mood == 'drop':
|
||||
c = random.choice(my_classes)
|
||||
drop(db, student_ID, c)
|
||||
my_classes.remove(c)
|
||||
elif mood == 'switch':
|
||||
old_c = random.choice(my_classes)
|
||||
new_c = random.choice(all_classes)
|
||||
switch(db, student_ID, old_c, new_c)
|
||||
my_classes.remove(old_c)
|
||||
my_classes.append(new_c)
|
||||
except Exception as e:
|
||||
print e, "Need to recheck available classes."
|
||||
all_classes = []
|
||||
|
||||
def run(students, ops_per_student):
|
||||
threads = [
|
||||
threading.Thread(target=indecisive_student, args=(i, ops_per_student))
|
||||
for i in range(students)]
|
||||
for thr in threads: thr.start()
|
||||
for thr in threads: thr.join()
|
||||
print "Ran", students * ops_per_student, "transactions"
|
||||
|
||||
if __name__ == "__main__":
|
||||
init(db)
|
||||
print "initialized"
|
||||
run(10, 10)
|
|
@ -0,0 +1,33 @@
|
|||
#############
|
||||
Client Design
|
||||
#############
|
||||
|
||||
FoundationDB supports language bindings for application development using the ordered key-value store. The following documents cover use of the bindings, from getting started and design principles to best practices and data modeling. The latest changes are detailed in :doc:`release-notes`.
|
||||
|
||||
* :doc:`getting-started-mac` explains how to install a local FoundationDB server suitable for development on macOS.
|
||||
|
||||
* :doc:`getting-started-linux` explains how to install a local FoundationDB server suitable for development on Linux.
|
||||
|
||||
* :doc:`downloads` describes the FoundationDB packages available on Artifactory.
|
||||
|
||||
* :doc:`developer-guide` explains principles of application development applicable across all language bindings.
|
||||
|
||||
* :doc:`data-modeling` explains recommended techniques for representing application data in the key-value store.
|
||||
|
||||
* :doc:`api-general` contains information on FoundationDB clients applicable across all language bindings.
|
||||
|
||||
* :doc:`known-limitations` describes both long-term design limitations of FoundationDB and short-term limitations applicable to the current version.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:titlesonly:
|
||||
:hidden:
|
||||
|
||||
release-notes
|
||||
getting-started-mac
|
||||
getting-started-linux
|
||||
downloads
|
||||
developer-guide
|
||||
data-modeling
|
||||
api-general
|
||||
known-limitations
|
|
@ -0,0 +1,272 @@
|
|||
.. _command-line-interface:
|
||||
|
||||
######################
|
||||
Command Line Interface
|
||||
######################
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
FoundationDB comes with a command line interface tool called ``fdbcli``. This document describes basic usage of ``fdbcli`` and the commands it supports. The use of ``fdbcli`` while :doc:`configuring <configuration>` and :doc:`administering <administration>` FoundationDB clusters is described in more detail in the documents on those topics and will be referenced as appropriate.
|
||||
|
||||
.. _cli-invocation:
|
||||
|
||||
Invocation at the Command Line
|
||||
==============================
|
||||
|
||||
You can invoke ``fdbcli`` at the command line simply by typing it. For example::
|
||||
|
||||
user@host$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb>
|
||||
|
||||
This will result in ``fdbcli`` connecting to the :ref:`default cluster file <default-cluster-file>` (``/etc/foundationdb/fdb.cluster`` for Linux.) You can also specify a cluster file as an argument to ``fdbcli`` using the ``-C`` option. For further information, see :ref:`specifying-a-cluster-file`.
|
||||
|
||||
Commands within ``fdbcli``
|
||||
==========================
|
||||
|
||||
The following commands can be issued from within ``fdbcli`` at the internal ``fdb>`` prompt:
|
||||
|
||||
begin
|
||||
-----
|
||||
|
||||
The ``begin`` command begins a new transaction. By default, ``fdbcli`` operates in autocommit mode. All operations are performed in their own transaction and are automatically committed. By explicitly beginning a transaction, successive operations are all performed as part of a single transaction.
|
||||
|
||||
To commit the transaction, use the ``commit`` command. To discard the transaction, use the ``reset`` command.
|
||||
|
||||
clear
|
||||
-----
|
||||
|
||||
The ``clear`` command clears a key from the database. Its syntax is ``clear <KEY>``. This command succeeds even if the specified key is not present but may fail due to conflicts.
|
||||
|
||||
Note that :ref:`characters can be escaped <cli-escaping>` when specifying keys (or values) in ``fdbcli``.
|
||||
|
||||
clearrange
|
||||
----------
|
||||
|
||||
The ``clearrange`` command clears a range of keys from the database. Its syntax is ``clearrange <BEGINKEY> <ENDKEY>``. All keys between ``<BEGINKEY>`` (inclusive) and ``<ENDKEY>`` (exclusive) are cleared from the database. This command succeeds even if the specified range is empty but may fail due to conflicts.
|
||||
|
||||
Note that :ref:`characters can be escaped <cli-escaping>` when specifying keys (or values) in ``fdbcli``.
|
||||
|
||||
commit
|
||||
------
|
||||
|
||||
The ``commit`` command commits the current transaction. Any sets or clears executed after the start of the current transaction will be committed to the database. On success, the committed version number is displayed. If commit fails, the error is displayed and the transaction must be retried.
|
||||
|
||||
configure
|
||||
---------
|
||||
|
||||
The ``configure`` command changes the database configuration. Its syntax is ``configure [new] [single|double|triple|three_data_hall|three_datacenter] [ssd|memory] [proxies=<N>] [resolvers=<N>] [logs=<N>]``.
|
||||
|
||||
The ``new`` option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When ``new`` is used, both a redundancy mode and a storage engine must be specified.
|
||||
|
||||
redundancy mode
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
Redundancy modes define storage requirements, required cluster size, and resilience to failure. The available redundancy modes are:
|
||||
|
||||
* ``single``
|
||||
* ``double``
|
||||
* ``triple``
|
||||
* ``three_datacenter``
|
||||
* ``three_data_hall``
|
||||
|
||||
For descriptions of redundancy modes, see :ref:`configuration-choosing-redundancy-mode`.
|
||||
|
||||
storage engine
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
The storage engine is responsible for durably storing data. FoundationDB has two storage engines:
|
||||
|
||||
* ``ssd``
|
||||
* ``memory``
|
||||
|
||||
For descriptions of storage engines, see :ref:`configuration-storage-engine`.
|
||||
|
||||
process types
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
A FoundationDB cluster employs server processes of different types. It automatically allocates these processes in default numbers appropriate for small-to-medium sized clusters.
|
||||
|
||||
For large clusters, you can manually set the allocated number of processes of a given type. Valid process types are:
|
||||
|
||||
* ``proxies``
|
||||
* ``resolvers``
|
||||
* ``logs``
|
||||
|
||||
Set the process using ``configure [proxies|resolvers|logs]=<N>``, where ``<N>`` is an integer greater than 0, or -1 to reset the value to its default.
|
||||
|
||||
For recommendations on appropriate values for process types in large clusters, see :ref:`configuration-large-cluster-performance`.
|
||||
|
||||
coordinators
|
||||
------------
|
||||
|
||||
The ``coordinators`` command is used to change cluster coordinators or description. Its syntax is ``coordinators auto|<ADDRESS...> [description=<DESC>]``.
|
||||
|
||||
Addresses may be specified as a list of IP:port pairs (such as ``coordinators 10.0.0.1:4000 10.0.0.2:4000 10.0.0.3:4000``). If addresses are specified, the coordinators will be set to them. An ``fdbserver`` process must be running on each of the specified addresses.
|
||||
|
||||
If ``auto`` is specified, coordinator addresses will be chosen automatically to support the configured redundancy level. (If the current set of coordinators are healthy and already support the configured redundancy level, nothing will be changed.)
|
||||
|
||||
For more information on setting coordinators, see :ref:`configuration-changing-coordination-servers`.
|
||||
|
||||
If ``description=<DESC>`` is specified, the description field in the cluster file is changed to ``<DESC>``, which must match ``[A-Za-z0-9_]+``.
|
||||
|
||||
For more information on setting the cluster description, see :ref:`configuration-setting-cluster-description`.
|
||||
|
||||
exclude
|
||||
-------
|
||||
|
||||
The ``exclude`` command excludes servers from the database. Its syntax is ``exclude <ADDRESS...>``. If no addresses are specified, the command provides the set of excluded servers.
|
||||
|
||||
For each IP address or IP:port pair in ``<ADDRESS...>``, the command adds the address to the set of excluded servers. It then waits until all database state has been safely moved off the specified servers.
|
||||
|
||||
For more information on excluding servers, see :ref:`removing-machines-from-a-cluster`.
|
||||
|
||||
exit
|
||||
----
|
||||
|
||||
The ``exit`` command exits ``fdbcli``.
|
||||
|
||||
get
|
||||
---
|
||||
|
||||
The ``get`` command fetches the value of a given key. Its syntax is ``get <KEY>``. It displays the value of ``<KEY>`` if ``<KEY>`` is present in the database and ``not found`` otherwise.
|
||||
|
||||
Note that :ref:`characters can be escaped <cli-escaping>` when specifying keys (or values) in ``fdbcli``.
|
||||
|
||||
getrange
|
||||
--------
|
||||
|
||||
The ``getrange`` command fetches key-value pairs in a range. Its syntax is ``getrange <BEGINKEY> [ENDKEY] [LIMIT]``. It displays up to ``<LIMIT>`` keys and values for keys between ``<BEGINKEY>`` (inclusive) and ``<ENDKEY>`` (exclusive). If ``<ENDKEY>`` is omitted, then the range will include all keys starting with ``<BEGINKEY>``. ``<LIMIT>`` defaults to 25 if omitted.
|
||||
|
||||
Note that :ref:`characters can be escaped <cli-escaping>` when specifying keys (or values) in ``fdbcli``.
|
||||
|
||||
getrangekeys
|
||||
------------
|
||||
|
||||
The ``getrangekeys`` command fetches keys in a range. Its syntax is ``getrangekeys <BEGINKEY> [ENDKEY] [LIMIT]``. It displays up to ``<LIMIT>`` keys for keys between ``<BEGINKEY>`` (inclusive) and ``<ENDKEY>`` (exclusive). If ``<ENDKEY>`` is omitted, then the range will include all keys starting with ``<BEGINKEY>``. ``<LIMIT>`` defaults to 25 if omitted.
|
||||
|
||||
Note that :ref:`characters can be escaped <cli-escaping>` when specifying keys (or values) in ``fdbcli``.
|
||||
|
||||
help
|
||||
----
|
||||
|
||||
The ``help`` command provides information on specific commands. Its syntax is ``help <TOPIC>``, where ``<TOPIC>`` is any of the commands in this section, ``escaping``, or ``options``. The latter two topics are described below:
|
||||
|
||||
.. _cli-escaping:
|
||||
|
||||
help escaping
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
``help escaping`` provides the following information on escaping keys and values within ``fdbcli``:
|
||||
|
||||
When parsing commands, ``fdbcli`` considers a space to delimit individual tokens. To include a space in a single value, you may either enclose the token in quotation marks ``"``, prefix the space with a backslash ``\``, or encode the space as a hex character.
|
||||
|
||||
To include a literal quotation mark in a token, precede it with a backslash ``\"``.
|
||||
|
||||
To express a binary value, encode each byte as a two-digit hex value, preceded by ``\x`` (e.g. ``\x20`` for a space character, or ``\x0a\x00\x00\x00`` for a 32-bit, little-endian representation of the integer 10).
|
||||
|
||||
All keys and values are displayed by ``fdbcli`` with non-printable characters and spaces encoded as two-digit hex bytes.
|
||||
|
||||
.. _cli-options:
|
||||
|
||||
help options
|
||||
^^^^^^^^^^^^
|
||||
|
||||
The following options are available for use with the ``option`` command:
|
||||
|
||||
``ACCESS_SYSTEM_KEYS`` - Allows this transaction to read and modify system keys (those that start with the byte ``0xFF``).
|
||||
|
||||
``CAUSAL_READ_RISKY`` - The read version will be committed. It will usually be the latest committed, but might not be in the event of a fault or partition.
|
||||
|
||||
``CAUSAL_WRITE_RISKY`` - The transaction, if not self-conflicting, may be committed a second time after commit succeeds, in the event of a fault.
|
||||
|
||||
``INITIALIZE_NEW_DATABASE`` - This is a write-only transaction which sets the initial configuration.
|
||||
|
||||
``NEXT_WRITE_NO_WRITE_CONFLICT_RANGE`` - The next write performed on this transaction will not generate a write conflict range. As a result, other transactions which read the key(s) being modified by the next write will not conflict with this transaction. Care needs to be taken when using this option on a transaction that is shared between multiple threads. When setting this option, write conflict ranges will be disabled on the next write operation, regardless of what thread it is on.
|
||||
|
||||
``PRIORITY_BATCH`` - Specifies that this transaction should be treated as low priority and that default priority transactions should be processed first. Useful for doing batch work simultaneously with latency-sensitive work.
|
||||
|
||||
``PRIORITY_SYSTEM_IMMEDIATE`` - Specifies that this transaction should be treated as highest priority and that lower priority transactions should block behind this one. Use is discouraged outside of low-level tools.
|
||||
|
||||
``READ_AHEAD_DISABLE`` - Disables read-ahead caching for range reads. Under normal operation, a transaction will read extra rows from the database into cache if range reads are used to page through a series of data one row at a time (i.e. if a range read with a one row limit is followed by another one row range read starting immediately after the result of the first).
|
||||
|
||||
``READ_YOUR_WRITES_DISABLE`` - Reads performed by a transaction will not see any prior mutations that occurred in that transaction, instead seeing the value which was in the database at the transaction's read version. This option may provide a small performance benefit for the client, but also disables a number of client-side optimizations which are beneficial for transactions which tend to read and write the same keys within a single transaction.
|
||||
|
||||
``RETRY_LIMIT`` - Set a maximum number of retries after which additional calls to ``onError`` will throw the most recently seen error code. Valid parameter values are ``[-1, INT_MAX]``. If set to -1, will disable the retry limit. Like all transaction options, the retry limit must be reset after a call to ``onError``. This behavior allows the user to make the retry limit dynamic.
|
||||
|
||||
``TIMEOUT`` - Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. Valid parameter values are ``[0, INT_MAX]``. If set to 0, will disable all timeouts. All pending and any future uses of the transaction will throw an exception. The transaction can be used again after it is reset. Like all transaction options, a timeout must be reset after a call to ``onError``. This behavior allows the user to make the timeouts dynamic.
|
||||
|
||||
|
||||
include
|
||||
-------
|
||||
|
||||
The ``include`` command permits previously excluded servers to rejoin the database. Its syntax is ``include all|<ADDRESS...>``.
|
||||
|
||||
If ``all`` is specified, the excluded servers list is cleared.
|
||||
|
||||
For each IP address or IP:port pair in ``<ADDRESS...>``, the command removes any matching exclusions from the excluded servers list. (A specified IP will match all ``IP:*`` exclusion entries).
|
||||
|
||||
For information on adding machines to a cluster, see :ref:`adding-machines-to-a-cluster`.
|
||||
|
||||
option
|
||||
------
|
||||
|
||||
The ``option`` command enables or disables an option. Its syntax is ``option <STATE> <OPTION> [ARG]``. Descriptions of :ref:`the available options <cli-options>` can be obtained within ``fdbcli`` by typing ``help options``.
|
||||
|
||||
If ``<STATE>`` is ``on``, then ``<OPTION>`` will be enabled with optional parameter ``<ARG>``, if required. If ``<STATE>`` is ``off``, then ``<OPTION>`` will be disabled.
|
||||
|
||||
If there is no active transaction, then the option will be applied to all operations as well as all subsequently created transactions (using ``begin``).
|
||||
|
||||
If there is an active transaction (one created with ``begin``), then enabled options apply only to that transaction. Options cannot be disabled on an active transaction.
|
||||
|
||||
Calling the ``option`` command with no parameters prints a list of all enabled options.
|
||||
|
||||
reset
|
||||
-----
|
||||
|
||||
The ``reset`` command resets the current transaction. Any sets or clears executed after the start of the active transaction will be discarded.
|
||||
|
||||
rollback
|
||||
--------
|
||||
|
||||
The ``rollback`` command rolls back the current transaction. The active transaction will be discarded, including any sets or clears executed since the transaction was started.
|
||||
|
||||
set
|
||||
---
|
||||
|
||||
The ``set`` command sets a value for a given key. Its syntax is ``set <KEY> <VALUE>``. If ``<KEY>`` is not already present in the database, it will be created.
|
||||
|
||||
Note that :ref:`characters can be escaped <cli-escaping>` when specifying keys (or values) in ``fdbcli``.
|
||||
|
||||
.. _cli-status:
|
||||
|
||||
status
|
||||
------
|
||||
|
||||
The ``status`` command reports the status of the FoundationDB cluster to which ``fdbcli`` is connected. Its syntax is ``status [minimal|details|json]``.
|
||||
|
||||
|
||||
If the cluster is down, this command will print a diagnostic which may be useful
|
||||
in figuring out what is wrong. If the cluster is running, this command will
|
||||
print cluster statistics.
|
||||
|
||||
status minimal
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
``status minimal`` will provide only an indication of whether the database is available.
|
||||
|
||||
status details
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
``status details`` will provide load information for individual workers.
|
||||
|
||||
For a detailed description of ``status`` output, see :ref:`administration-monitoring-cluster-status`.
|
||||
|
||||
status json
|
||||
^^^^^^^^^^^
|
||||
|
||||
``status json`` will provide the cluster status in its JSON format. For a detailed description of this format, see :doc:`mr-status`.
|
|
@ -0,0 +1,528 @@
|
|||
.. |multiplicative-suffixes| replace:: Sizes must be specified as a number of bytes followed by one of the multiplicative suffixes B=1, KB=10\ :sup:`3`, KiB=2\ :sup:`10`, MB=10\ :sup:`6`, MiB=2\ :sup:`20`, GB=10\ :sup:`9`, GiB=2\ :sup:`30`, TB=10\ :sup:`12`, or TiB=2\ :sup:`40`.
|
||||
|
||||
#############
|
||||
Configuration
|
||||
#############
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
This document contains *reference* information for configuring a new FoundationDB cluster. We recommend that you read this document before setting up a cluster for performance testing or production use. For *step-by-step instructions* to follow when setting up a cluster, see :doc:`building-cluster`.
|
||||
|
||||
.. note:: In FoundationDB, a "cluster" refers to one or more FoundationDB processes spread across one or more physical machines that together host a FoundationDB database.
|
||||
|
||||
To plan an externally accessible cluster, you need to understand some basic aspects of the system. You can start by reviewing the :ref:`system requirements <system-requirements>`, then how to :ref:`choose <configuration-choosing-coordination-servers>` and :ref:`change coordination servers <configuration-changing-coordination-servers>`. Next, you should look at the :ref:`configuration file <foundationdb-conf>`, which controls most other aspects of the system. Then, you should understand how to :ref:`choose a redundancy mode <configuration-choosing-redundancy-mode>` and :ref:`configure the storage subsystem <configuration-configuring-storage-subsystem>`. Finally, there are some configurations you can adjust to improve performance if your :ref:`cluster is large <configuration-large-cluster-performance>`.
|
||||
|
||||
.. _system-requirements:
|
||||
|
||||
System requirements
|
||||
===================
|
||||
|
||||
* One of the following 64-bit operating systems:
|
||||
|
||||
* A supported Linux distribution:
|
||||
|
||||
* RHEL/CentOS 6.x and 7.x
|
||||
* Ubuntu 12.04 or later (but see :ref:`Platform Issues for Ubuntu 12.x <platform-ubuntu-12>`)
|
||||
|
||||
* Or, an unsupported Linux distribution with:
|
||||
|
||||
* Kernel version between 2.6.33 and 3.0.x (inclusive) or 3.7 or greater
|
||||
* Works with .deb or .rpm packages
|
||||
|
||||
* Or, macOS 10.7 or later
|
||||
|
||||
.. warning:: The macOS version of the FoundationDB server is intended for use on locally accessible development machines only. Other uses are not supported.
|
||||
|
||||
* 4GB **ECC** RAM (per fdbserver process)
|
||||
* Storage
|
||||
|
||||
* SSDs are required when storing data sets larger than memory (using the ``ssd`` storage engine).
|
||||
* HDDs are OK when storing data sets smaller than memory (using the ``memory`` storage engine).
|
||||
* For more information, see :ref:`configuration-configuring-storage-subsystem`.
|
||||
|
||||
For a description of issues on particular platforms that affect the operation of FoundationDB, see :doc:`platforms`.
|
||||
|
||||
.. _configuration-choosing-coordination-servers:
|
||||
|
||||
Choosing coordination servers
|
||||
=============================
|
||||
|
||||
FoundationDB uses a set of *coordination servers* (or *coordinators* for short) to maximize the fault tolerance (and, in particular, the availability) of the cluster. The coordinators work by communicating and storing a small amount of shared state. If one or more machines are down or unable to communicate with the network, the cluster may become partitioned. In that event, FoundationDB selects the partition in which a majority of coordinators are reachable as the one that will remain available.
|
||||
|
||||
Any FoundationDB process can be used as a coordinator for any set of clusters. The performance impact of acting as a coordinator is negligible. The coordinators aren't involved at all in committing transactions.
|
||||
|
||||
Administrators should choose the number and physical location of coordinators to maximize fault tolerance. Most configurations should follow these guidelines:
|
||||
|
||||
* Choose an odd number of coordinators.
|
||||
* Use enough coordinators to complement the :ref:`redundancy mode <configuration-choosing-redundancy-mode>` of the cluster, often 3 or 5.
|
||||
* Place coordinators in different racks, circuits, or datacenters with independence of failure.
|
||||
* It is OK to place coordinators in distant datacenters; in normal operation the latency to a coordinator does not affect system latency.
|
||||
|
||||
The set of coordinators is stored on each client and server in the :ref:`cluster file <foundationdb-cluster-file>`.
|
||||
|
||||
.. _configuration-changing-coordination-servers:
|
||||
|
||||
Changing coordination servers
|
||||
=============================
|
||||
|
||||
It is sometimes necessary to change the set of coordinators servers. You may want to do so because of changing network conditions, machine failures, or just planning adjustments. You can change coordinators using an automated ``fdbcli`` command. FoundationDB will maintain ACID guarantees during the changes.
|
||||
|
||||
You can change coordinators when the following conditions are met:
|
||||
|
||||
* a majority of the current coordinators are available;
|
||||
* all of the new coordinators are available; and
|
||||
* client and server cluster files are writable.
|
||||
|
||||
``fdbcli`` supports a ``coordinators`` command to specify the new list of coordinators::
|
||||
|
||||
user@host$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> coordinators 10.0.4.1:4500 10.0.4.2:4500 10.0.4.3:4500
|
||||
Coordinators changed
|
||||
|
||||
After running this command, you can check that it completed successfully by using the ``status details`` command::
|
||||
|
||||
fdb> status details
|
||||
|
||||
Configuration:
|
||||
Redundancy mode - triple
|
||||
Storage engine - ssd
|
||||
Coordinators - 3
|
||||
|
||||
Cluster:
|
||||
FoundationDB processes - 3
|
||||
Machines - 3
|
||||
Memory availability - 4.1 GB per process on machine with least available
|
||||
Fault Tolerance - 0 machines
|
||||
Server time - Wed Oct 8 14:41:34 2014
|
||||
|
||||
Data:
|
||||
Replication health - Healthy
|
||||
Moving data - 0.000 GB
|
||||
Sum of key-value sizes - 0 MB
|
||||
|
||||
Operating space:
|
||||
Storage server - 1.0 GB free on most full server
|
||||
Transaction log - 1.0 GB free on most full server
|
||||
|
||||
Workload:
|
||||
Read rate - 2 Hz
|
||||
Write rate - 0 Hz
|
||||
Transactions started - 2 Hz
|
||||
Transactions committed - 0 Hz
|
||||
Conflict rate - 0 Hz
|
||||
|
||||
Process performance details:
|
||||
10.0.4.1:4500 ( 3% cpu; 2% machine; 0.004 Gbps; 0% disk; 2.5 GB / 4.1 GB RAM )
|
||||
10.0.4.2:4500 ( 1% cpu; 2% machine; 0.004 Gbps; 0% disk; 2.5 GB / 4.1 GB RAM )
|
||||
10.0.4.3:4500 ( 1% cpu; 2% machine; 0.004 Gbps; 0% disk; 2.5 GB / 4.1 GB RAM )
|
||||
|
||||
Coordination servers:
|
||||
10.0.4.1:4500
|
||||
10.0.4.2:4500
|
||||
10.0.4.3:4500
|
||||
|
||||
Client time: Thu Nov 20 09:50:45 2014
|
||||
|
||||
The list of coordinators verifies that the coordinator change succeeded. A few things might cause this process to not go smoothly:
|
||||
|
||||
* If any of the new coordination servers fail before you run the ``coordinators`` command, the change will not occur, and the database will continue to use the old coordinators.
|
||||
* If a majority of the new coordination servers fail during or after the change, the database will not be available until a majority of them are available again.
|
||||
* If a majority of the old coordination servers fail before the change is completed, the database will be unavailable until a majority of them are available again. Consequently, the ``coordinators`` command cannot be used to repair a database which is unavailable because its coordinators are down.
|
||||
|
||||
Once the change is complete, database servers and clients need to communicate with the old coordinators in order to update their cluster file to point to the new coordinators. (Each database server and client will physically re-write their cluster file to reference the new coordinators.) In most cases this process occurs automatically within a matter of seconds.
|
||||
|
||||
If some servers or clients are unable to write to their cluster file or are disconnected during the change of coordinators (and too many of the old coordinators become unavailable before they come back up), an administrator will need to manually copy the new cluster file to each affected machine and restart the database server or client, as if adding a new server or client to the cluster.
|
||||
|
||||
The ``coordinators`` command also supports a convenience option, ``coordinators auto``, that automatically selects a set of coordination servers appropriate for the redundancy mode::
|
||||
|
||||
user@host1$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> coordinators auto
|
||||
Coordinators changed
|
||||
|
||||
``coordinators auto`` will not make any changes if the current coordinators are all available and support the current redundancy level.
|
||||
|
||||
.. note:: |coordinators-auto|
|
||||
|
||||
.. _configuration-setting-cluster-description:
|
||||
|
||||
Changing the cluster description
|
||||
================================
|
||||
|
||||
A cluster is named by the ``description`` field recorded in the ``fdb.cluster`` file (see :ref:`cluster-file-format`). For convenience of reference, you may want to change the ``description`` if you operate more than one cluster on the same machines. (Each cluster must be uniquely identified by the combination of ``description`` and ``ID``.)
|
||||
|
||||
You can change the ``description`` using the ``coordinators`` command within ``fdbcli``::
|
||||
|
||||
fdb> coordinators description=cluster_b
|
||||
Coordination state changed
|
||||
|
||||
You can also combine a change of ``description`` with :ref:`changing coordinators <configuration-changing-coordination-servers>`, whether by listing the coordinators or with ``coordinators auto``::
|
||||
|
||||
fdb> coordinators auto description=cluster_c
|
||||
Coordination state changed
|
||||
|
||||
.. _foundationdb-conf:
|
||||
|
||||
The configuration file
|
||||
======================
|
||||
|
||||
The ``fdbserver`` server process is run and monitored on each server by the ``fdbmonitor`` :ref:`daemon <administration_fdbmonitor>`. ``fdbmonitor`` and ``fdbserver`` itself are controlled by the ``foundationdb.conf`` file located at:
|
||||
|
||||
* ``/etc/foundationdb/foundationdb.conf`` on Linux
|
||||
* ``/usr/local/etc/foundationdb/foundationdb.conf`` on macOS
|
||||
|
||||
The ``foundationdb.conf`` file contains several sections, detailed below. Note that the presence of individual ``[fdbserver.<ID>]`` sections actually causes ``fdbserver`` processes to be run.
|
||||
|
||||
.. note:: |conf-file-change-detection|
|
||||
|
||||
.. warning:: Do not attempt to stop FoundationDB services by removing the configuration file. Removing the file will not stop the services; it will merely remove your ability to control them in the manner supported by FoundationDB. During normal operation, services can be stopped by commenting out or removing the relevant sections of the configuration file. You can also disable a service at the operating system level or by removing the software.
|
||||
|
||||
``[fdbmonitor]`` section
|
||||
------------------------
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
## foundationdb.conf
|
||||
##
|
||||
## Configuration file for FoundationDB server processes
|
||||
## Full documentation is available in the FoundationDB Administration document.
|
||||
|
||||
[fdbmonitor]
|
||||
restart_delay = 60
|
||||
user = foundationdb
|
||||
group = foundationdb
|
||||
|
||||
Contains basic configuration parameters of the ``fdbmonitor`` process. ``restart_delay`` specifies the number of seconds that ``fdbmonitor`` waits before restarting a failed process. ``user`` and ``group`` are used on Linux systems to control the privilege level of child processes.
|
||||
|
||||
``[general]`` section
|
||||
-----------------------
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
[general]
|
||||
cluster_file = /etc/foundationdb/fdb.cluster
|
||||
|
||||
Contains settings applicable to all processes (e.g. fdbserver, backup_agent). The main setting of interest is ``cluster_file``, which specifies the location of the cluster file. This file and the directory that contains it must be writable by all processes (i.e. by the user or group set in the [fdbmonitor] section).
|
||||
|
||||
.. _foundationdb-conf-fdbserver:
|
||||
|
||||
``[fdbserver]`` section
|
||||
-----------------------
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
## Default parameters for individual fdbserver processes
|
||||
[fdbserver]
|
||||
command = /usr/sbin/fdbserver
|
||||
public_address = auto:$ID
|
||||
listen_address = public
|
||||
datadir = /var/lib/foundationdb/data/$ID
|
||||
logdir = /var/log/foundationdb
|
||||
# logsize = 10MiB
|
||||
# maxlogssize = 100MiB
|
||||
# machine_id =
|
||||
# datacenter_id =
|
||||
# class =
|
||||
# memory = 8GiB
|
||||
# storage_memory = 1GiB
|
||||
|
||||
Contains default parameters for all fdbserver processes on this machine. These same options can be overridden for individual processes in their respective ``[fdbserver.<ID>]`` sections. In this section, the ID of the individual fdbserver can be substituted by using the ``$ID`` variable in the value. For example, ``public_address = auto:$ID`` makes each fdbserver listen on a port equal to its ID.
|
||||
|
||||
.. note:: |multiplicative-suffixes|
|
||||
|
||||
* ``command``: The location of the ``fdbserver`` binary.
|
||||
* ``public_address``: The publicly visible IP:Port of the process. If ``auto``, the address will be the one used to communicate with the coordination servers.
|
||||
* ``listen_address``: The IP:Port that the server socket should bind to. If ``public``, it will be the same as the public_address.
|
||||
* ``datadir``: A writable directory (by root or by the user set in the [fdbmonitor] section) where persistent data files will be stored.
|
||||
* ``logdir``: A writable directory (by root or by the user set in the [fdbmonitor] section) where FoundationDB will store log files.
|
||||
* ``logsize``: Roll over to a new log file after the current log file reaches the specified size. The default value is 10MiB.
|
||||
* ``maxlogssize``: Delete the oldest log file when the total size of all log files exceeds the specified size. If set to 0B, old log files will not be deleted. The default value is 100MiB.
|
||||
* ``machine_id``: Machine identifier key. Processes that share a key are considered non-unique for the purposes of data replication. By default, processes on a machine determine a unique key to share. This does not generally need to be set. The ID can be up to 16 hexadecimal digits.
|
||||
* ``datacenter_id``: Data center identifier key. All processes physically located in a data center should share the id. If unset, defaults to a special "default" data center. If you are depending on data center based replication this must be set on all processes. The ID can be up to 16 hexadecimal digits.
|
||||
* ``class``: Machine class specifying the roles that will be taken in the cluster. Valid options are ``storage``, ``transaction``, ``resolution``. See :ref:`configuration-large-cluster-performance` for machine class recommendations in large clusters.
|
||||
* ``memory``: Maximum memory used by the process. The default value is 8GiB. When specified without a unit, MiB is assumed. This parameter does not change the memory allocation of the program. Rather, it sets a hard limit beyond which the process will kill itself and be restarted. The default value of 8GiB is double the intended memory usage in the default configuration (providing an emergency buffer to deal with memory leaks or similar problems). It is *not* recommended to decrease the value of this parameter below its default value. It may be *increased* if you wish to allocate a very large amount of storage engine memory or cache. In particular, when the ``storage_memory`` parameter is increased, the ``memory`` parameter should be increased by an equal amount.
|
||||
* ``storage_memory``: Maximum memory used for data storage. This parameter is used *only* with the memory storage engine, not the ssd storage engine. The default value is 1GiB. When specified without a unit, MB is assumed. Clusters will be restricted to using this amount of memory per process for purposes of data storage. Memory overhead associated with storing the data is counted against this total. If you increase the ``storage_memory``, you should also increase the ``memory`` parameter by the same amount.
|
||||
|
||||
``[fdbserver.<ID>]`` section(s)
|
||||
---------------------------------
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
## An individual fdbserver process with id 4500
|
||||
## Parameters set here override defaults from the [fdbserver] section
|
||||
[fdbserver.4500]
|
||||
|
||||
Each section of this type represents an ``fdbserver`` process that will be run. IDs cannot be repeated. Frequently, an administrator will choose to run one ``fdbserver`` per CPU core. Parameters set in this section apply to only a single fdbserver process, and override the defaults set in the ``[fdbserver]`` section. Note that by default, the ID specified in this section is also used as the network port and the data directory.
|
||||
|
||||
Backup agent sections
|
||||
----------------------
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
[backup_agent]
|
||||
command = /usr/lib/foundationdb/backup_agent/backup_agent
|
||||
|
||||
[backup_agent.1]
|
||||
|
||||
These sections run and configure the backup agent process used for :doc:`point-in-time backups <backups>` of FoundationDB. These don't usually need to be modified. The structure and functionality is similar to the ``[fdbserver]`` and ``[fdbserver.<ID>]`` sections.
|
||||
|
||||
|
||||
.. _configuration-choosing-redundancy-mode:
|
||||
|
||||
Choosing a redundancy mode
|
||||
==========================
|
||||
|
||||
FoundationDB supports a variety of redundancy modes. These modes define storage requirements, required cluster size, and resilience to failure. To change the redundancy mode, use the ``configure`` command of ``fdbcli``. For example::
|
||||
|
||||
user@host$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> configure double
|
||||
Configuration changed.
|
||||
|
||||
The available redundancy modes are described below.
|
||||
|
||||
Single datacenter modes
|
||||
-----------------------
|
||||
|
||||
+------------------------------+--+-----------------+-----------------+----------------+
|
||||
| | | single | double | triple |
|
||||
+==============================+==+=================+=================+================+
|
||||
| Best for | | 1-2 machines | 3-4 machines | 5+ machines |
|
||||
+------------------------------+--+-----------------+-----------------+----------------+
|
||||
| Replication | | 1 copy | 2 copy | 3 copy |
|
||||
+------------------------------+--+-----------------+-----------------+----------------+
|
||||
| # live machines | | | | |
|
||||
| to make progress | | 1 | 2 | 3 |
|
||||
+------------------------------+--+-----------------+-----------------+----------------+
|
||||
| Minimum # of machines | | | | |
|
||||
| for fault tolerance | | impossible | 3 | 4 |
|
||||
+------------------------------+--+-----------------+-----------------+----------------+
|
||||
| Ideal # of | | | | |
|
||||
| coordination servers | | 1 | 3 | 5 |
|
||||
+------------------------------+--+-----------------+-----------------+----------------+
|
||||
| # simultaneous failures | | | | |
|
||||
| after which data may be lost | | any machine | 2+ machines | 3+ machines |
|
||||
+------------------------------+--+-----------------+-----------------+----------------+
|
||||
|
||||
In the three single datacenter redundancy modes, FoundationDB replicates data across the required number of machines in the cluster, but without aiming for datacenter redundancy. Although machines may be placed in more than one datacenter, the cluster will not be tolerant of datacenter-correlated failures.
|
||||
|
||||
FoundationDB will never use processes on the same machine for the replication of any single piece of data. For this reason, references to the number of "machines" in the summary table above are the number of physical machines, not the number of ``fdbserver`` processes.
|
||||
|
||||
``single`` mode
|
||||
*(best for 1-2 machines)*
|
||||
|
||||
FoundationDB does not replicate data and needs only one physical machine to make progress. Because data is not replicated, the database is not fault-tolerant. This mode is recommended for testing on a single development machine. (``single`` mode *will* work with clusters of two or more computers and will partition data for increased performance but the cluster will not tolerate the loss of any machines.)
|
||||
|
||||
``double`` mode
|
||||
*(best for 3-4 machines)*
|
||||
|
||||
FoundationDB replicates data to two machines, so two or more machines are required to make progress. The loss of one machine can be survived without losing data, but if only two machines were present originally, the database will be unavailable until the second machine is restored, another machine is added, or the replication mode is changed.
|
||||
|
||||
.. note:: In double redundancy mode, we recommend using three coordinators. In a two machine double redundancy cluster without a third coordinator, the coordination state has no redundancy. Losing either machine will make the database unavailable, and recoverable only by an unsafe manual recovery of the coordination state.
|
||||
|
||||
.. _configuration-redundancy-mode-triple:
|
||||
|
||||
``triple`` mode
|
||||
*(best for 5+ machines)*
|
||||
|
||||
FoundationDB replicates data to three machines, and at least three available machines are required to make progress. This is the recommended mode for a cluster of five or more machines in a single datacenter.
|
||||
|
||||
``three_data_hall`` mode
|
||||
FoundationDB replicates data to three machines, and at least three available machines are required to make progress. Every piece of data that has been committed to storage servers
|
||||
will be replicated onto three different data halls, and the cluster will
|
||||
remain available after losing a single data hall and one machine in another
|
||||
data hall.
|
||||
|
||||
Datacenter-aware mode
|
||||
---------------------
|
||||
|
||||
In addition to the more commonly used modes listed above, this version of FoundationDB has support for redundancy across multiple datacenters. Although data will always be triple replicated in this mode, it may not be replicated across all datacenters.
|
||||
|
||||
.. note:: When using the datacenter-aware mode, all ``fdbserver`` processes should be passed a valid datacenter identifier on the command line.
|
||||
|
||||
``three_datacenter`` mode
|
||||
*(for 5+ machines in 3 datacenters)*
|
||||
|
||||
FoundationDB attempts to replicate data across three datacenters and will stay up with only two available. Data is triple replicated. For maximum availability, you should use five coordination servers: two in two of the datacenters and one in the third datacenter.
|
||||
|
||||
Changing redundancy mode
|
||||
------------------------
|
||||
|
||||
You can change the redundancy mode of a database at any time using ``fdbcli``. For example, after adding more machines to a cluster that was initially configured in ``single`` mode, you might increase the redundancy mode to ``triple``. After the change, FoundationDB will replicate your data accordingly (and remain available while it does so).
|
||||
|
||||
If a database is unavailable because it has too few machines to function in its current redundancy mode, you can restore it to operation by changing the redundancy mode to one with lower requirements. For example, if a database configured in ``triple`` mode in a 4 server cluster loses two servers, it will stop operating because the configured redundancy is unachievable. It will start working immediately if you configure it to ``double`` mode. Consider the consequences of reducing the redundancy level carefully before doing so. If you reduce or eliminate redundancy and there are further hardware failures, your data could be lost permanently. The best option, if available, is to add new hardware to the cluster to restore it to its minimum operating size.
|
||||
|
||||
Similarly, if you change the redundancy mode to a mode that cannot make progress with currently available hardware (for example, to ``triple`` when there are only two machines available), the database will immediately become unavailable. Changing the mode back or adding the necessary machines to the cluster will restore it to operation.
|
||||
|
||||
.. _configuration-configuring-storage-subsystem:
|
||||
|
||||
Configuring the storage subsystem
|
||||
=================================
|
||||
|
||||
.. _configuration-storage-engine:
|
||||
|
||||
Storage engines
|
||||
---------------
|
||||
|
||||
A storage engine is the part of the database that is responsible for storing data to disk. FoundationDB has two storage engine options, ``ssd`` and ``memory``.
|
||||
|
||||
For both storage engines, FoundationDB commits transactions to disk with the number of copies indicated by the redundancy mode before reporting them committed. This procedure guarantees the *durability* needed for full ACID compliance. At the point of the commit, FoundationDB may have only *logged* the transaction, deferring the work of updating the disk representation. This deferral has significant advantages for latency and burst performance. Due to this deferral, it is possible for disk space usage to continue increasing after the last commit.
|
||||
|
||||
To change the storage engine, use the ``configure`` command of ``fdbcli``. For example::
|
||||
|
||||
user@host$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> configure ssd
|
||||
Configuration changed.
|
||||
|
||||
.. note:: The :ref:`storage space requirements <storage-space-requirements>` of each storage engine are discussed in the Administration Guide.
|
||||
|
||||
.. _configuration-storage-engine-ssd:
|
||||
|
||||
``ssd`` storage engine
|
||||
*(optimized for SSD storage)*
|
||||
|
||||
Data is stored on disk in B-tree data structures optimized for use on :ref:`SSDs <ssd-info>`. This engine is more robust when the right disk hardware is available, as it can store large amounts of data.
|
||||
|
||||
The ``ssd`` engine recovers storage space on a deferred basis after data is deleted from the database. Following a deletion, the engine slowly shuffles empty B-tree pages to the end of the database file and truncates them. This activity is given a low priority relative to normal database operations, so there may be a delay before the database reaches minimum size.
|
||||
|
||||
.. note :: The ``ssd`` storage engine can be slow to return space from deleted data to the filesystem when the number of free pages is very large. This condition can arise when you have deleted most of the data in your database or greatly increased the number of processes in your cluster. In this condition, the database is still able to reuse the space, but the disk files may remain large for an extended period.
|
||||
|
||||
.. note :: Because the database is able to reuse the space, action is required only if your application requires the space for some non-database purpose. In this case, you can reclaim space by :ref:`excluding affected server processes<removing-machines-from-a-cluster>` (either singly or a few at a time) and then including them again. Excluding removes the storage server and its data files completely, and when the server is included again, it uses a fresh database file.
|
||||
|
||||
Because this engine is tuned for SSDs, it may have poor performance or even availability problems when run on weaker I/O subsystems such as spinning disks or network attached storage.
|
||||
|
||||
.. _configuration-storage-engine-memory:
|
||||
|
||||
``memory`` storage engine
|
||||
*(optimized for small databases)*
|
||||
|
||||
Data is stored in memory and logged to disk. In this storage engine, all data must be resident in memory at all times, and all reads are satisfied from memory. Additionally, all writes are saved to disk to ensure that data is always fully durable. This engine works well with storage subsystems, such as spinning disks, that have good sequential write performance but poor random I/O performance.
|
||||
|
||||
By default, each process using the memory storage engine is limited to storing 1 GB of data (including overhead). This limit can be changed using the ``storage_memory`` parameter as documented in :ref:`foundationdb.conf <foundationdb-conf>`.
|
||||
|
||||
When using the ``memory`` engine, especially with a larger memory limit, it can take some time (seconds to minutes) for a storage machine to start up. This is because it needs to reconstruct its in-memory data structure from the logs stored on disk.
|
||||
|
||||
Storage locations
|
||||
---------------------
|
||||
|
||||
Each FoundationDB process stores its state in a subdirectory of the directory supplied to it by the ``datadir`` configuration parameter in the :ref:`configuration file <foundationdb-conf>`. By default this directory is:
|
||||
|
||||
* ``/var/lib/foundationdb/data/<port>`` on Linux.
|
||||
* ``/usr/local/foundationdb/data/<port>`` on macOS.
|
||||
|
||||
.. warning:: You must always use different ``datadir`` settings for different processes!
|
||||
|
||||
To use multiple disks for performance and capacity improvements, configure the ``datadir`` of individual ``fdbserver`` processes to point to different disks, as follows:
|
||||
|
||||
1. :ref:`Stop <administration-running-foundationdb>` the FoundationDB server processes on a machine.
|
||||
2. Transfer the contents of the state subdirectories to new directories on the desired disks, and make sure the directories are writable by the ``fdbserver``.
|
||||
3. Edit :ref:`foundationdb.conf <foundationdb-conf>` to reference the new directories. For example::
|
||||
|
||||
[fdbserver.4500]
|
||||
datadir=/ssd1/foundationdb/4500
|
||||
|
||||
[fdbserver.4501]
|
||||
datadir=/ssd2/foundationdb/4501
|
||||
|
||||
4. :ref:`Start <administration-running-foundationdb>` the FoundationDB server processes on the machine.
|
||||
|
||||
.. _ssd-info:
|
||||
|
||||
SSD considerations
|
||||
------------------
|
||||
|
||||
FoundationDB is designed to work with SSDs (solid state drives). SSD performance is both highly variable and hard to get good data on from public sources. Since OS configuration, controller, firmware, and driver can each have a large impact on performance (up to an order of magnitude), drives should be tested to make sure the advertised performance characteristics are being achieved. Tests should be conducted with the same load profile as FoundationDB, which typically creates a workload of mixed small reads and writes at a high queue depth.
|
||||
|
||||
SSDs are unlike HDDs in that they fail after a certain amount of wear. FoundationDB will use all of the SSDs in the cluster at approximately the same rate. Therefore, if a cluster is started on brand new SSDs, all of them will fail due to wear at approximately the same time. To prevent this occurrence, rotate new drives into the cluster on a continuous basis and monitor their S.M.A.R.T. attributes to check wear levels.
|
||||
|
||||
Filesystem
|
||||
------------
|
||||
|
||||
FoundationDB recommends the ext4 filesystem. (However, see :ref:`Platform Issues for Ubuntu 12.x <platform-ubuntu-12>` for an issue relating to ext4 on that platform.)
|
||||
|
||||
.. warning::
|
||||
* FoundationDB requires filesystem support for kernel asynchronous I/O.
|
||||
* Older filesystems such as ext3 lack important features for operating safely and efficiently with an SSD.
|
||||
* Copy-on-write type filesystems (such as Btrfs) will likely have poor performance with FoundationDB.
|
||||
|
||||
|
||||
Ext4 filesystems should be mounted with mount options ``defaults,noatime,discard``.
|
||||
|
||||
.. note::
|
||||
   The ``noatime`` option disables updating of access times when reading files, an unneeded feature for FoundationDB that increases write activity on the disk. The ``discard`` option enables `TRIM <http://en.wikipedia.org/wiki/TRIM>`_ support, allowing the operating system to efficiently inform the SSD of erased blocks, maintaining high write speed and increasing drive lifetime.
|
||||
|
||||
Durability and caching
|
||||
-------------------------
|
||||
|
||||
FoundationDB relies on the correct operation of ``fsync()`` to ensure the durability of transactions in the event of power failures. The combination of the file system, mount options, hard disk controller, and hard disk all need to work together properly to ensure that the operating system is correct when it reports to FoundationDB that data is safely stored.
|
||||
|
||||
If you are unsure about your hardware setup and need to ensure that data durability is maintained in all possible situations, we recommend that you test your hardware.
|
||||
|
||||
Disk partitioning
|
||||
-------------------
|
||||
|
||||
.. note:: Modern Linux distributions already adhere to the suggestions below for newly formatted disks. If your system is recently installed this section can be skipped.
|
||||
|
||||
For performance reasons, it is critical that partitions on SSDs in Linux be aligned with the Erase Block Size (EBS) of the drive. The value of the EBS is vendor specific, but a value of 1024 KiB is both greater than or equal to and a multiple of any current EBS, so is safe to use with any SSD. (Defaulting to 1024 KiB is how Windows 7 and recent Linux distributions guarantee efficient SSD operation.)
|
||||
To verify that the start of the partition is aligned with the EBS of the drive, use the ``fdisk`` utility::
|
||||
|
||||
user@host$ sudo fdisk -l /dev/sdb
|
||||
|
||||
Disk /dev/sdb: 128.0 GB, 128035676160 bytes
|
||||
30 heads, 63 sectors/track, 132312 cylinders, total 250069680 sectors
|
||||
Units = sectors of 1 * 512 = 512 bytes
|
||||
Sector size (logical/physical): 512 bytes / 512 bytes
|
||||
I/O size (minimum/optimal): 512 bytes / 512 bytes
|
||||
Disk identifier: 0xe5169faa
|
||||
|
||||
Device Boot Start End Blocks Id System
|
||||
/dev/sdb1 2048 250069679 125033816 83 Linux
|
||||
|
||||
This output shows a properly partitioned disk in Ubuntu 12.04.
|
||||
|
||||
When creating a partition for use with FoundationDB using the standard Linux fdisk utility, DOS compatibility mode should be disabled (``c`` command in interactive mode) and display units should be set to sectors (``u`` command in interactive mode), rather than cylinders. Again, this is the default mode in recent Linux distributions but must be configured on older systems.
|
||||
|
||||
For an SSD with a single partition, the partition should typically begin at sector 2048 (512 byte sectors yields 1024 KiB alignment).
|
||||
|
||||
.. _configuration-large-cluster-performance:
|
||||
|
||||
Large cluster performance
|
||||
=========================
|
||||
|
||||
.. note:: For small-to-medium clusters (32 processes or fewer), FoundationDB's default behavior generally provides the best performance, and you should ignore this section. Further configuration is recommended only for large clusters (> 32 processes) or if you have special latency requirements.
|
||||
|
||||
In a FoundationDB cluster, each of the ``fdbserver`` processes perform different tasks. FoundationDB automatically assigns each machine in the cluster a ``class`` that specifies the tasks it will perform. For large clusters, FoundationDB also provides the ability to tune cluster performance by manually assigning the ``class`` of some machines.
|
||||
|
||||
To assign machine classes manually, set the ``class=transaction`` parameter in :ref:`foundationdb.conf <foundationdb-conf>` on all processes on selected machines. The ratio of total processes to ``class``-specified processes should be about 8:1. For example, if you have 64 processes on 16 machines, you would set ``class=transaction`` for 8 processes on 2 machines.
|
||||
|
||||
For large clusters with high write workloads (greater than 100,000 writes/second), you can increase performance by increasing the number of proxies, resolvers, and log servers. These are set using ``fdbcli`` in equal (1:1:1) proportions among the processes on machines set to ``class=transaction``.
|
||||
|
||||
For example, if you have 384 processes on 96 machines, and a workload greater than 100,000 writes per second, you would set ``class=transaction`` for 48 processes on 12 machines. Of the latter, you would set 16 processes on 4 machines each as *proxies*, *resolvers*, and *log servers*. To do so, you would issue the ``fdbcli`` commands::
|
||||
|
||||
fdb> configure proxies=16
|
||||
fdb> configure resolvers=16
|
||||
fdb> configure logs=16
|
||||
|
||||
.. note:: In the present release, the default value for proxies and log servers is 3 and for resolvers is 1. The ratios discussed above are guidelines; regardless of them, you should not set the value of a process type to less than its default. For example, on clusters ranging from 36 to 60 processes with high write workloads, you may choose to increase the number of resolvers to 2. In this case, you would nevertheless leave the number of proxies and log servers at their default values of 3.
|
||||
|
||||
.. warning:: The conflict-resolution algorithm used by FoundationDB is conservative: it guarantees that no conflicting transactions will be committed, but it may fail to commit some transactions that theoretically could have been. The effects of this conservatism may increase as you increase the number of proxies. It is therefore important to employ the recommended techniques for :ref:`minimizing conflicts <developer-guide-transaction-conflicts>` when increasing the number of proxies.
|
||||
|
||||
You can contact us on the `community forums <https://forums.foundationdb.org>`_ if you are interested in more details or if you are benchmarking or performance-tuning on large clusters. Also see our `performance benchmarks </performance>`_ for a baseline of how a well-configured cluster should perform.
|
|
@ -0,0 +1,30 @@
|
|||
###########
|
||||
Consistency
|
||||
###########
|
||||
|
||||
Two meanings of consistency
|
||||
===========================
|
||||
|
||||
The concept of *consistency* comes up a great deal in the context of distributed databases. However, the term is used in one sense in the context of the ACID properties and in another in the context of the CAP theorem. The two meanings are often confused.
|
||||
|
||||
* The "C" in ACID refers to the property that data remains within an application's `integrity constraints <http://en.wikipedia.org/wiki/Integrity_constraints>`_. (For example, the constraint that some data and its index are consistent with respect to each other.)
|
||||
|
||||
* The "C" in CAP relates to a `consistency model <http://en.wikipedia.org/wiki/Consistency_model>`_, which describes the conditions under which write operations from one client become visible to other clients. (One example is the eventual consistency model, in which writes are expected to be consistent across replicas after a sufficiently long period.)
|
||||
|
||||
Integrity constraints relate to the application domain, whereas consistency models relate to the internal operation of the database itself. Both are important in that failure to support them can lead to corrupted data.
|
||||
|
||||
Integrity constraints
|
||||
=====================
|
||||
|
||||
Applications typically have integrity constraints defined by their domains. These constraints can take the form of type constraints on certain data values ("domain integrity"), a relation between two or more data values ("referential integrity"), or business rules drawn from the application domain.
|
||||
|
||||
In relational database management systems, integrity constraints are usually specified using SQL when the relational schema is designed. In this approach, the specification and enforcement of integrity constraints is tightly bound to the relational model. At the other extreme, most NoSQL databases simply do not support integrity constraints, shifting the burden of maintaining data integrity entirely onto the application developer.
|
||||
|
||||
FoundationDB takes a third approach. Because integrity constraints are defined by the application domain, the FoundationDB core does not directly enforce them. However, FoundationDB's transactions, with their guarantees of atomicity and isolation, give the application developer the power to straightforwardly maintain integrity constraints as the domain requires. In simple terms, so long as each transaction individually maintains the desired constraints, FoundationDB guarantees that multiple clients executing transactions simultaneously will also maintain those constraints. This approach allows a broad range of data models, including document-oriented or relational models to be built as :doc:`layers <layer-concept>` that maintain their own integrity constraints.
|
||||
|
||||
Consistency model
|
||||
=================
|
||||
|
||||
Consistency models serve to define the guarantees the database provides about when concurrent writes become visible to readers. Consistency models fall along a spectrum, depending on the strength of the guarantees. In general, stronger consistency models make reasoning about the database easier and speed development. For example, *causal* consistency guarantees that readers see all previously committed writes. *Eventual* consistency guarantees only that readers see writes after "sufficient time". Eventual consistency is the model used in many of the first-generation NoSQL systems.
|
||||
|
||||
FoundationDB provides the strongest possible consistency model, `sequential consistency <http://en.wikipedia.org/wiki/Sequential_consistency>`_ (closely related to `serializability <http://en.wikipedia.org/wiki/Serializability>`_ from the database literature), providing the greatest possible ease of development.
|
|
@ -0,0 +1,13 @@
|
|||
:orphan:
|
||||
|
||||
Site Map
|
||||
========
|
||||
|
||||
The full contents of this documentation site are listed below.
|
||||
If you are having trouble finding something in particular, try the search box in the navbar.
|
||||
|
||||
.. toctree::
|
||||
:glob:
|
||||
:includehidden:
|
||||
|
||||
index
|
|
@ -0,0 +1,546 @@
|
|||
.. default-domain:: py
|
||||
.. highlight:: python
|
||||
.. module:: fdb
|
||||
|
||||
.. Required substitutions for api-common.rst.inc
|
||||
|
||||
.. |database-type| replace:: ``Database``
|
||||
.. |database-class| replace:: ``Database``
|
||||
.. |database-auto| replace:: FIXME
|
||||
.. |transaction-class| replace:: ``Transaction``
|
||||
.. |get-key-func| replace:: get_key()
|
||||
.. |get-range-func| replace:: get_range()
|
||||
.. |commit-func| replace:: FIXME
|
||||
.. |init-func| replace:: FIXME
|
||||
.. |open-func| replace:: FIXME
|
||||
.. |set-cluster-file-func| replace:: FIXME
|
||||
.. |set-local-address-func| replace:: FIXME
|
||||
.. |on-error-func| replace:: FIXME
|
||||
.. |null-type| replace:: FIXME
|
||||
.. |error-type| replace:: FIXME
|
||||
.. |error-raise-type| replace:: FIXME
|
||||
.. |reset-func-name| replace:: FIXME
|
||||
.. |reset-func| replace:: FIXME
|
||||
.. |cancel-func| replace:: FIXME
|
||||
.. |read-your-writes-disable-option| replace:: FIXME
|
||||
.. |future-cancel| replace:: FIXME
|
||||
.. |max-watches-database-option| replace:: FIXME
|
||||
.. |future-type-string| replace:: FIXME
|
||||
.. |lazy-iterator-object| replace:: FIXME
|
||||
.. |key-meth| replace:: FIXME
|
||||
.. |directory-subspace| replace:: FIXME
|
||||
.. |directory-layer| replace:: FIXME
|
||||
.. |subspace| replace:: FIXME
|
||||
.. |subspace-api| replace:: FIXME
|
||||
.. |as-foundationdb-key| replace:: FIXME
|
||||
.. |as-foundationdb-value| replace:: FIXME
|
||||
.. |tuple-layer| replace:: FIXME
|
||||
.. |dir-path-type| replace:: FIXME
|
||||
.. |node-subspace| replace:: FIXME
|
||||
.. |content-subspace| replace:: FIXME
|
||||
.. |allow-manual-prefixes| replace:: FIXME
|
||||
|
||||
.. include:: api-common.rst.inc
|
||||
|
||||
#############
|
||||
Data Modeling
|
||||
#############
|
||||
|
||||
FoundationDB's core provides a simple data model coupled with powerful transactions. This combination allows building richer data models and libraries that inherit the scalability, performance, and integrity of the database. The goal of data modeling is to design a mapping of data to keys and values that enables effective storage and retrieval. Good decisions will yield an extensible, efficient abstraction. This document covers the fundamentals of data modeling with FoundationDB.
|
||||
|
||||
* For general guidance on application development using FoundationDB, see :doc:`developer-guide`.
|
||||
* For detailed API documentation specific to each supported language, see :doc:`api-reference`.
|
||||
|
||||
The core data model
|
||||
===================
|
||||
|
||||
FoundationDB's core data model is an ordered key-value store. Also known as an ordered associative array, map, or dictionary, this is a common data structure composed of a collection of key-value pairs in which all keys are unique. Starting with this simple model, an application can create higher-level data models by mapping their elements to individual keys and values.
|
||||
|
||||
In FoundationDB, both keys and values are simple byte strings. Apart from storage and retrieval, the database does not interpret or depend on the content of values. In contrast, keys are treated as members of a total order, the lexicographic order over the underlying bytes, in which keys are sorted by each byte in order. For example:
|
||||
|
||||
* ``'0'`` is sorted before ``'1'``
|
||||
* ``'apple'`` is sorted before ``'banana'``
|
||||
* ``'apple'`` is sorted before ``'apple123'``
|
||||
* keys starting with ``'mytable\'`` are sorted together (e.g. ``'mytable\row1'``, ``'mytable\row2'``, ...)
|
||||
|
||||
The ordering of keys is especially relevant for range operations. An application should structure keys to produce an ordering that allows efficient data retrieval with range reads.
|
||||
|
||||
.. _encoding-data-types:
|
||||
|
||||
Encoding data types
|
||||
===================
|
||||
|
||||
Because keys and values in FoundationDB are always byte strings, an application developer must serialize other data types (e.g., integers, floats, arrays) before storing them in the database. For values, the main concerns for serialization are simply CPU and space efficiency. For keys, there's an additional consideration: it's often important for keys to preserve the order of the data types (whether primitive or composite) they encode. For example:
|
||||
|
||||
Integers
|
||||
--------
|
||||
|
||||
* The standard tuple layer provides an order-preserving, signed, variable length encoding.
|
||||
* For positive integers, a big-endian fixed length encoding is order-preserving.
|
||||
* For signed integers, a big-endian fixed length two's-complement encoding with the most significant (sign) bit inverted is order-preserving.
|
||||
|
||||
Unicode strings
|
||||
---------------
|
||||
|
||||
* For unicode strings ordered lexicographically by unicode code point, use UTF-8 encoding. (This approach is used by the tuple layer.)
|
||||
* For unicode strings ordered by a particular collation (for example, a case insensitive ordering for a particular language), use an appropriate string collation transformation and then apply UTF-8 encoding. Internationalization or "locale" libraries in most environments and programming languages provide a string collation transformation, for example `C <http://pubs.opengroup.org/onlinepubs/7908799/xsh/wcsxfrm.html>`_, `C++ <http://www.cplusplus.com/reference/std/locale/collate/transform/>`_, `Python <http://docs.python.org/py3k/library/locale.html#locale.strxfrm>`_, `Ruby <https://github.com/ninjudd/icunicode#readme>`_, `Java <http://docs.oracle.com/javase/1.5.0/docs/api/java/text/Collator.html#getCollationKey(java.lang.String)>`_, the `ICU <http://icu-project.org/apiref/icu4c/classCollator.html#ae524fd43a06d4429e2c76bef35874d4c>`_ library, etc. Usually the output of this function is a unicode string, which needs to be further encoded in a code-point ordered encoding such as UTF-8 to get a byte string.
|
||||
|
||||
Floating point numbers
|
||||
----------------------
|
||||
|
||||
The tuple layer provides an order-preserving, signed, fixed length encoding for both single- and double-precision floating point
|
||||
numbers based off of the IEEE big-endian encoding with some modifications to make it correctly ordered. Within this representation,
|
||||
-0 and +0 are not equal and negative NaN values will sort before all non-NaN values and positive NaN values will sort after
|
||||
all non-NaN values. Otherwise, the representation is consistent with the mathematical ordering.
|
||||
|
||||
Composite types
|
||||
---------------
|
||||
|
||||
An application's data is often represented using composite types, such as structures or records with multiple fields. It's very useful for the application to use *composite* keys to store such data. In FoundationDB, composite keys can be conveniently represented as *tuples* that are mapped to individual keys for storage.
|
||||
|
||||
.. note:: For the purpose of illustration, we'll use the FoundationDB's Python language binding, including the :py:func:`@fdb.transactional <fdb.transactional>` decorator described in :doc:`api-python`. The design patterns illustrated are applicable to all of the :doc:`languages <api-reference>` supported by FoundationDB.
|
||||
|
||||
.. _data-modeling-tuples:
|
||||
|
||||
Tuples
|
||||
======
|
||||
|
||||
FoundationDB's keys are ordered, making tuples a particularly useful tool for data modeling. FoundationDB provides a :ref:`tuple layer <api-python-tuple-layer>` (available in each language binding) that encodes tuples into keys. This layer lets you store data using a tuple like ``(state, county)`` as a key. Later, you can perform reads using a prefix like ``(state,)``. The layer works by preserving the natural ordering of the tuples.
|
||||
|
||||
You could implement a naive encoding of tuples of strings into keys by using a tab character as a simple delimiter. You could do this with the following Python code::
|
||||
|
||||
def tuple_to_key_with_tab(tup):
|
||||
return '\t'.join(str(i) for i in tup)
|
||||
|
||||
# Example: Order first by state, then by county
|
||||
@fdb.transactional
|
||||
def set_county_population(tr, state, county, pop):
|
||||
tr[tuple_to_key_with_tab((state, county))] = str(pop)
|
||||
|
||||
In this example, population figures for the United States are stored using keys formed from the tuple of state and county.
|
||||
|
||||
Of course, this encoding would only work if all the bytes in the individual keys in the tuple were greater than the delimiter byte. Therefore, FoundationDB's built-in tuple layer implements a more robust encoding supporting elements of various data types: byte strings, unicode strings, signed integers, floating-point numbers, booleans, UUIDs, null values, and nested tuples.
|
||||
|
||||
.. note:: The tuple layer's encoding is compatible between languages, although some languages are limited in what data types they support. For language-specific documentation of the tuple layer, see the corresponding :doc:`api-reference` documentation.
|
||||
|
||||
Because of its ordering of keys, FoundationDB supports efficient range reads on any set of keys that share a prefix. The tuple layer preserves the ordering of tuples sorted by element from left to right; as a result, the leftmost elements of a tuple will always represent a prefix in keyspace and can be used for range reads. A basic principle of data modeling with the tuple layer is to order tuple elements to facilitate such range reads. The examples below illustrate this principle.
|
||||
|
||||
Sometimes data attributes will have a natural order of containment imposed by your domain. A common example is geographic attributes, such as state and county in the United States. By constructing keys from tuples of the form ``(state, county)``, where state is the first tuple element, all data for states will be stored in an adjacent range of keys. This ordering allows you to retrieve the populations for all counties in a given state with a single range read. You could use the tuple layer with the following functions::
|
||||
|
||||
@fdb.transactional
|
||||
def set_county_population(tr, state, county, pop):
|
||||
tr[fdb.tuple.pack((state, county))] = str(pop)
|
||||
|
||||
@fdb.transactional
|
||||
def get_county_populations_in_state(tr, state):
|
||||
return [int(pop) for k, pop in tr[fdb.tuple.range((state,))]]
|
||||
|
||||
Date/timestamp attributes form another example with a natural containment order. If you have attributes of year, month, day, hour, minute, and/or second, you can order them from larger to smaller units in your keys. As a result, you'll be able to retrieve temporally contiguous data with range reads, as above.
|
||||
|
||||
A few simple models
|
||||
===================
|
||||
|
||||
Let's begin with a few examples of simple data models built on tuples with :ref:`subspaces <developer-guide-sub-keyspaces>`.
|
||||
|
||||
Arrays
|
||||
------
|
||||
|
||||
You can easily map arrays to the key-value store using tuples. To model a named, one-dimensional array, you can construct a key for each array element using the array name as the subspace and the array index as the second tuple element.
|
||||
|
||||
For example, suppose you have a ``'temps2012'`` array containing a year's worth of daily temperature averages. The temperatures are indexed by an integer ranging from 1 to 365 representing the day. Your keys would then be constructed from tuples of the form ``('temps2012', day)``.
|
||||
|
||||
To set and get array elements with this technique, you can use Python functions such as::
|
||||
|
||||
@fdb.transactional
|
||||
def array_set(tr, array_space, index, value):
|
||||
tr[array_space[index]] = str(value)
|
||||
|
||||
@fdb.transactional
|
||||
def array_get(tr, array_space, index):
|
||||
return tr[array_space[index]]
|
||||
|
||||
temp_array = Subspace(('temps2012',))
|
||||
|
||||
@fdb.transactional
|
||||
def add_temp(tr, day, temp):
|
||||
array_set(tr, temp_array, day, temp)
|
||||
|
||||
@fdb.transactional
|
||||
def get_temp(tr, day):
|
||||
val = array_get(tr, temp_array, day)
|
||||
if val.present():
|
||||
return int(val)
|
||||
else:
|
||||
return None
|
||||
|
||||
This approach has a few nice properties:
|
||||
|
||||
* It can be extended to multidimensional arrays simply by adding additional array indexes to the tuples.
|
||||
* Unassigned elements consume no storage, so sparse arrays are stored efficiently.
|
||||
|
||||
The tuple layer makes these properties easy to achieve, and most well-designed data models using tuples will share them.
|
||||
|
||||
An array can only have a single value for each index. Likewise, the key-value store can only have a single value for each key. The simple mapping above takes advantage of this correspondence to store the array value as a physical value. In contrast, some data structures are designed to store multiple values. In these cases, data models can store the logical values within the key itself, as illustrated next.
|
||||
|
||||
Multimaps
|
||||
---------
|
||||
A multimap is a generalization of an associative array in which each key may be associated with multiple values. Multimaps are often implemented as associative arrays in which the values are sets rather than primitive data types.
|
||||
|
||||
Suppose you have a multimap that records student enrollment in classes, with students as keys and classes as values. Each student can be enrolled in more than one class, so you need to map the key-value pairs of the multimap (with their multiple values) to the database.
|
||||
|
||||
A simple approach is to use the multimap name (say, ``'enroll'``) as the subspace and construct a key from a tuple of the form ``('enroll', student, class_name)`` for each class in which a student is enrolled. Each class will generate a unique key, allowing as many classes as needed. Moreover, all the data in the multimap will be captured in the key, so you can just use an empty string for its value. Using this approach, you can add a class for a student or get all the student's classes with the Python functions::
|
||||
|
||||
@fdb.transactional
|
||||
def multi_set(tr, multi_space, index, value):
|
||||
tr[multi_space.pack((index, value))] = ''
|
||||
|
||||
@fdb.transactional
|
||||
def multi_get(tr, multi_space, index):
|
||||
pairs = tr[multi_space.range((index,))]
|
||||
return [multi_space.unpack(k)[-1] for k, v in pairs]
|
||||
|
||||
@fdb.transactional
|
||||
def multi_is_element(tr, multi_space, index, value):
|
||||
val = tr[multi_space.pack((index, value))]
|
||||
return val.present()
|
||||
|
||||
enroll_space = Subspace(('enroll',))
|
||||
|
||||
@fdb.transactional
|
||||
def add_class(tr, student, class_name):
|
||||
multi_set(tr, enroll_space, student, class_name)
|
||||
|
||||
@fdb.transactional
|
||||
def get_classes(tr, student):
|
||||
return multi_get(tr, enroll_space, student)
|
||||
|
||||
The ``range()`` method in :py:func:`multi_get` returns all keys in the subspace that encode tuples with the specified tuple as a prefix. The ``[-1]`` extracts the last element of the tuple unpacked from the key, which in this case will encode a class.
|
||||
|
||||
As this model for multimaps illustrates, data that is treated as a value at one level may be mapped to a key in the database. (The reverse may also occur, as shown in the discussion of indirection below.) Data modeling in FoundationDB is not dictated by how your data is represented in your programming language.
|
||||
|
||||
.. _data-modeling-tables:
|
||||
|
||||
Tables
|
||||
------
|
||||
|
||||
You can easily use tuples to store data in tabular form with rows and columns. The simplest data model for a table is to make each cell in the table a key-value pair. To do this, you construct a key from a tuple containing the row and column identifiers. As with the array model, unassigned cells in tables constructed using this technique will consume no storage, so sparse tables can be stored efficiently. As a result, a table can safely have a very large number of columns.
|
||||
|
||||
You can make your model row-oriented or column-oriented by placing either the row or column first in the tuple, respectively. Because the lexicographic order sorts tuple elements from left to right, access is optimized for the element placed first. Placing the row first makes it efficient to read all the cells in a particular row; reversing the order makes reading a column more efficient.
|
||||
|
||||
Using the table name as the subspace, we could implement the common row-oriented version in Python as follows::
|
||||
|
||||
@fdb.transactional
|
||||
def table_set_cell(tr, table_space, row, column, value):
|
||||
tr[table_space.pack((row, column))] = str(value)
|
||||
|
||||
@fdb.transactional
|
||||
def table_get_cell(tr, table_space, row, column):
|
||||
return tr[table_space.pack((row, column))]
|
||||
|
||||
@fdb.transactional
|
||||
def table_set_row(tr, table_space, row, cols):
|
||||
del tr[table_space.range((row,))]
|
||||
        for c, v in cols.items():
|
||||
table_set_cell(tr, table_space, row, c, v)
|
||||
|
||||
@fdb.transactional
|
||||
def table_get_row(tr, table_space, row):
|
||||
cols = {}
|
||||
for k, v in tr[table_space.range((row,))]:
|
||||
_, c = table_space.unpack(k)
|
||||
cols[c] = v
|
||||
return cols
|
||||
|
||||
.. _data-modeling-entity-relationship:
|
||||
|
||||
Entity-relationship models
|
||||
==========================
|
||||
|
||||
Entity-relationship models are often used to describe a database at various levels of abstraction. In this methodology, a *logical* data model consisting of entities, attributes, and relationships is defined before mapping it to a *physical* data model specifying keys and other implementation features. Entity-relationship models can be easily modeled in FoundationDB using tuples.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
|
||||
Suppose you're storing entity-relationship data for users in an ``'ER'`` subspace. You might identify each entity with a unique identifier and define a key for each attribute with the tuple ``('ER', entity_ID, attribute)``. You could then store the user's region using the Python functions::
|
||||
|
||||
ER_space = Subspace(('ER',))
|
||||
|
||||
@fdb.transactional
|
||||
def add_attribute_value(tr, entity_ID, attribute, value):
|
||||
tr[ER_space.pack((entity_ID, attribute))] = str(value)
|
||||
|
||||
@fdb.transactional
|
||||
def get_attribute_value(tr, entity_ID, attribute):
|
||||
return tr[ER_space.pack((entity_ID, attribute))]
|
||||
|
||||
@fdb.transactional
|
||||
def add_user_region(tr, user_ID, region):
|
||||
add_attribute_value(tr, user_ID, 'region', region)
|
||||
|
||||
Relationships
|
||||
-------------
|
||||
|
||||
Using the pattern we saw above with multimaps, you can store relationships and related entities as an element of the key and use an empty string as the physical value. Suppose your users can belong to one or more groups. To add a user to a group or retrieve all groups to which a user belongs, you can use the Python functions::
|
||||
|
||||
@fdb.transactional
|
||||
def add_relationship(tr, relationship, primary_key, foreign_key):
|
||||
tr[ER_space.pack((relationship, primary_key, foreign_key))] = ''
|
||||
|
||||
@fdb.transactional
|
||||
def get_relationships(tr, relationship):
|
||||
return [ER_space.unpack(k)[1:]
|
||||
for k, v in tr.get_range_startswith(ER_space.pack((relationship,)),
|
||||
streaming_mode=fdb.StreamingMode.want_all)]
|
||||
|
||||
@fdb.transactional
|
||||
def get_related_entities(tr, relationship, primary_key):
|
||||
items = tr[ER_space.range((relationship, primary_key))]
|
||||
return [ER_space.unpack(k)[-1] for k, v in items]
|
||||
|
||||
@fdb.transactional
|
||||
def is_related_entity(tr, relationship, primary_key, foreign_key):
|
||||
return tr[ER_space.pack((relationship, primary_key, foreign_key))].present()
|
||||
|
||||
@fdb.transactional
|
||||
def add_user_to_group(tr, user_ID, group_name):
|
||||
add_relationship(tr, 'belongs_to', user_ID, group_name)
|
||||
|
||||
@fdb.transactional
|
||||
def get_users_groups(tr, user_ID):
|
||||
return get_related_entities(tr, 'belongs_to', user_ID)
|
||||
|
||||
You can extend this code by adding indexes for the related entities (see below) and enforcement of relationship cardinalities (one-to-many, etc.).
|
||||
|
||||
.. _data-modeling-indexes:
|
||||
|
||||
Indexes
|
||||
=======
|
||||
|
||||
A common technique is to store the same data in different ways to allow efficient retrieval for multiple use cases, creating indexes. This technique is especially useful when there are many more reads than writes. For example, you may find it most convenient to store user data based on ``user_ID`` but sometimes need to retrieve users based on their region. An index allows this retrieval to be performed efficiently.
|
||||
|
||||
An index can have a very simple tuple structure consisting of a unique subspace, the relationship being indexed, and a value: ``(subspace_for_index, relationship, value)``. Placing the relationship before the value is what allows efficient retrieval of all the associated values with a single range read.
|
||||
|
||||
With FoundationDB's transactions, you can easily build an index and guarantee that it stays in sync with the data: just update the index in the same transaction that updates the data.
|
||||
|
||||
|
||||
For example, suppose you'd like to add an index to efficiently look up users by region. You can augment the Python function :py:func:`add_user` with the index and add a new function for retrieval::
|
||||
|
||||
user_space = Subspace(('user',))
|
||||
region_index = Subspace(('region_idx',))
|
||||
|
||||
@fdb.transactional
|
||||
def add_user(tr, user_ID, name, region):
|
||||
tr[user_space.pack((user_ID, region))] = str(name)
|
||||
tr[region_index.pack((region, user_ID))] = ''
|
||||
|
||||
@fdb.transactional
|
||||
def get_users_in_region(tr, region):
|
||||
items = tr[region_index.range((region,))]
|
||||
return [region_index.unpack(k)[-1] for k, v in items]
|
||||
|
||||
To apply this technique to a real use case, you would add code to your update transaction to delete outdated index entries. Note that this approach lets you add as many indexes as desired by updating all the indexes in the same transaction.
|
||||
|
||||
Composite models
|
||||
================
|
||||
|
||||
Most of the techniques we've discussed can be freely combined. Let's look at adding indexes to our basic data model for tables.
|
||||
|
||||
We've already seen a way to store tabular data in a row-oriented order using table names as subspaces. You can extend this model by simultaneously storing the table in both row-oriented and column-oriented layouts, allowing efficient retrieval of either an entire row or an entire column. We'll create nested subspaces for the indexes using the subscripting syntax we saw above::
|
||||
|
||||
table_space = Subspace(('table',))
|
||||
row_index = table_space['row_idx']
|
||||
col_index = table_space['col_idx']
|
||||
|
||||
@fdb.transactional
|
||||
def table_set_cell(tr, row_index, col_index, row, column, value):
|
||||
tr[row_index.pack((row, column))] = str(value)
|
||||
tr[col_index.pack((column, row))] = str(value)
|
||||
|
||||
@fdb.transactional
|
||||
def table_get_cell(tr, row_index, row, column):
|
||||
return tr[row_index.pack((row, column))]
|
||||
|
||||
@fdb.transactional
|
||||
def table_get_row(tr, row_index, row):
|
||||
cols = {}
|
||||
for k, v in tr[row_index.range((row,))]:
|
||||
r, c = row_index.unpack(k)
|
||||
cols[c] = v
|
||||
return cols
|
||||
|
||||
@fdb.transactional
|
||||
def table_get_col(tr, col_index, col):
|
||||
rows = {}
|
||||
for k, v in tr[col_index.range((col,))]:
|
||||
c, r = col_index.unpack(k)
|
||||
rows[r] = v
|
||||
return rows
|
||||
|
||||
|
||||
.. _data-modeling-hierarchies:
|
||||
|
||||
Hierarchies
|
||||
===========
|
||||
|
||||
Many applications work with hierarchical data represented by nested dictionaries or similar composite data types. Such data is often serialized to or deserialized from a format such as JSON or XML. Looking at a hierarchical object as a tree, you can use a tuple to represent the full path to each leaf (sometimes called a "materialized path"). By storing each full path as a key, you get an index for each leaf. FoundationDB can then efficiently retrieve any individual piece of data or entire sub-tree.
|
||||
|
||||
For example, suppose you have hierarchical data such as the following nested dictionaries and lists::
|
||||
|
||||
{'user': { 'jones':
|
||||
{ 'friendOf': 'smith',
|
||||
'group': ['sales', 'service']},
|
||||
'smith':
|
||||
{ 'friendOf': 'jones',
|
||||
'group': ['dev', 'research']}}}
|
||||
|
||||
To distinguish the list elements from dictionary elements and preserve the order of the lists, you can just include the index of each list element before it in the tuple. Using this technique, the data above would be converted to the following tuples::
|
||||
|
||||
[('user', 'jones', 'friendOf', 'smith'),
|
||||
('user', 'jones', 'group', 0, 'sales'),
|
||||
('user', 'jones', 'group', 1, 'service'),
|
||||
('user', 'smith', 'friendOf', 'jones'),
|
||||
('user', 'smith', 'group', 0, 'dev'),
|
||||
('user', 'smith', 'group', 1, 'research')]
|
||||
|
||||
Suppose you'd like to use this representation to implement a nested keyspace, i.e., a key-value store in which values can themselves be nested dictionaries or lists. Your application receives a stream of serialized JSON objects in which different objects may contain data about the same entities, so you'd like to store the data in a common nested keyspace.
|
||||
|
||||
You can deserialize the data using Python's standard ``json`` module, generate the corresponding set of paths as tuples, and store each tuple in a ``'hier'`` subspace::
|
||||
|
||||
import json, itertools
|
||||
|
||||
hier_space = Subspace(('hier',))
|
||||
|
||||
EMPTY_OBJECT = -2
|
||||
EMPTY_ARRAY = -1
|
||||
|
||||
def to_tuples(item):
|
||||
if item == {}:
|
||||
return [(EMPTY_OBJECT, None)]
|
||||
elif item == []:
|
||||
return [(EMPTY_ARRAY, None)]
|
||||
elif type(item) == dict:
|
||||
return [(k,) + sub for k, v in item.iteritems() for sub in to_tuples(v)]
|
||||
elif type(item) == list:
|
||||
return [(k,) + sub for k, v in enumerate(item) for sub in to_tuples(v)]
|
||||
else:
|
||||
return [(item,)]
|
||||
|
||||
@fdb.transactional
|
||||
def insert_hier(tr, hier):
|
||||
if type(hier) == str:
|
||||
hier = json.loads(hier)
|
||||
for tup in to_tuples(hier):
|
||||
tr[hier_space.pack(tup)] = ''
|
||||
|
||||
You can then retrieve any sub-tree from the nested keyspace by giving the partial path to its root. The partial path will just be a tuple that your query function uses as a key prefix for a range read. For example, to retrieve the data for ``'smith'`` from the hierarchy above, you would use ``('user', 'smith')``.
|
||||
|
||||
The retrieved data will be a list of tuples. The final step before returning the data is to convert it back to a nested data structure::
|
||||
|
||||
def from_tuples(tuples):
|
||||
first = tuples[0] # The first tuple will tell us what kind of object we have
|
||||
|
||||
if len(first) == 1: return first[0] # Primitive value
|
||||
if first == (EMPTY_OBJECT,None): return {}
|
||||
if first == (EMPTY_ARRAY, None): return []
|
||||
|
||||
# For an object or array, we need to group the tuples by their first element
|
||||
groups = [list(g) for k, g in itertools.groupby(tuples, lambda t:t[0])]
|
||||
|
||||
if first[0] == 0: # array
|
||||
return [from_tuples([t[1:] for t in g]) for g in groups]
|
||||
else: # object
|
||||
return dict((g[0][0], from_tuples([t[1:] for t in g])) for g in groups)
|
||||
|
||||
@fdb.transactional
|
||||
def get_sub_hier(tr, prefix):
|
||||
return from_tuples([hier_space.unpack(k)
|
||||
for k, v in tr[hier_space.range(prefix)]])
|
||||
|
||||
.. _data-modeling-documents:
|
||||
|
||||
Documents
|
||||
=========
|
||||
|
||||
Suppose you'd like to use the above representation to implement a simple document-oriented data model. As before, your application receives serialized data in JSON, only now you'd like to store each JSON object as an independent document. To do so, you just need to ensure that each tuple created for that object is stored with a unique identifier for the document. If a ``doc_id`` has not already been supplied, you can randomly generate one.
|
||||
|
||||
To store a path, you can construct a composite key in a ``'doc'`` subspace, with the ``doc_id`` as the next element, followed by the remainder of the path. You can store the leaf (the last element of the tuple) as the value, which enables storage of larger data sizes (see :ref:`data-modeling-performance-guidelines`)::
|
||||
|
||||
import random
|
||||
|
||||
doc_space = Subspace(('doc',))
|
||||
|
||||
@fdb.transactional
|
||||
def insert_doc(tr, doc):
|
||||
if type(doc) == str:
|
||||
doc = json.loads(doc)
|
||||
if not 'doc_id' in doc:
|
||||
doc['doc_id'] = random.randint(0, 100000000)
|
||||
for tup in to_tuples( doc ):
|
||||
tr[doc_space.pack((doc['doc_id'],) + tup[:-1])] = fdb.tuple.pack((tup[-1],))
|
||||
return doc['doc_id']
|
||||
|
||||
@fdb.transactional
|
||||
def get_doc(tr, doc_id):
|
||||
return from_tuples([doc_space.unpack(k)[1:] + fdb.tuple.unpack(v)
|
||||
for k, v in tr[doc_space.range((doc_id,))]])
|
||||
|
||||
.. _data-modeling-indirection:
|
||||
|
||||
Indirection
|
||||
===========
|
||||
It is sometimes beneficial to add a level of indirection to a data model. Instead of using key-value pairs to directly store application data, you can instead store a reference to that data. This approach can be used to model any data structure that would normally use references. You just need to perform any modifications to the data structure in a transaction that leaves it in a consistent state.
|
||||
|
||||
Suppose you want to maintain data in a singly linked list. The application data can use a tuple structure like those of single-valued relationships. Links will be similar but will use node identifiers as their values. Here is an example of removing the next node from the list::
|
||||
|
||||
node_space = Subspace(('node',))
|
||||
|
||||
@fdb.transactional
|
||||
def remove_next_node(tr, node_ID):
|
||||
next_ID = tr[node_space.pack((node_ID, 'next'))]
|
||||
if next_ID != '':
|
||||
next_next_ID = tr[node_space.pack((next_ID, 'next'))]
|
||||
tr[node_space.pack((node_ID, 'next'))] = next_next_ID
|
||||
del tr[node_space.range((next_ID,))]
|
||||
|
||||
FoundationDB's transactional guarantees ensure that, even when multiple clients are concurrently modifying the same linked list, the structure will be maintained in a consistent way.
|
||||
|
||||
.. _data-modeling-performance-guidelines:
|
||||
|
||||
Key and value sizes
|
||||
===================
|
||||
|
||||
How you map your application data to keys and values can have a dramatic impact on performance. Below are some guidelines to consider as you design a data model. (For more general discussion of performance considerations, see :ref:`developer-guide-peformance-considerations`.)
|
||||
|
||||
* Structure keys so that range reads can efficiently retrieve the most frequently accessed data.
|
||||
|
||||
* If you perform a range read that is, in total, much more than 1 kB, try to restrict your range as much as you can while still retrieving the needed data.
|
||||
|
||||
* Structure keys so that no single key needs to be updated too frequently, which can cause transaction conflicts.
|
||||
|
||||
* If a key is updated more than 10-100 times per second, try to split it into multiple keys.
|
||||
* For example, if a key is storing a counter, split the counter into N separate counters that are randomly incremented by clients. The total value of the counter can then be read by adding up the N individual ones.
|
||||
|
||||
* Keep key sizes small.
|
||||
|
||||
* Try to keep key sizes below 1 kB. (Performance will be best with key sizes below 32 bytes and *cannot* be more than 10 kB.)
|
||||
* When using the tuple layer to encode keys (as is recommended), select short strings or small integers for tuple elements. Small integers will encode to just two bytes.
|
||||
* If your key sizes are above 1 kB, try either to move data from the key to the value, split the key into multiple keys, or encode the parts of the key more efficiently (remembering to preserve any important ordering).
|
||||
|
||||
* Keep value sizes moderate.
|
||||
|
||||
* Try to keep value sizes below 10 kB. (Value sizes *cannot* be more than 100 kB.)
|
||||
* If your value sizes are above 10 kB, consider splitting the value across multiple keys.
|
||||
* If you read values with sizes above 1 kB but use only a part of each value, consider splitting the values using multiple keys.
|
||||
* If you frequently perform individual reads on a set of values that total to fewer than 200 bytes, try either to combine the values into a single value or to store the values in adjacent keys and use a range read.
|
||||
|
||||
Large Values and Blobs
|
||||
----------------------
|
||||
|
||||
If your keys or values are much larger than the above guidelines, it may be difficult to find a data model that resizes them appropriately. Unstructured data, such as binary large objects, can be especially challenging to segment manually. In this case, a good option is to use our blob layer. See our tutorial on :doc:`Managing Large Values and Blobs <largeval>` for further discussion.
|
||||
|
||||
|
||||
Data modeling examples: tutorials
|
||||
=================================
|
||||
|
||||
The :doc:`tutorials` provide examples of data modeling using the tuple layer. They use techniques applicable to all of the :doc:`languages <api-reference>` supported by FoundationDB.
|
|
@ -0,0 +1 @@
|
|||
.. note:: Because of a `bug in the Linux kernel <https://bugzilla.kernel.org/show_bug.cgi?id=43260>`_, **FoundationDB might deadlock when running on Ubuntu 12.04 or 12.10** using the default ext4 filesystem. This was fixed in the 3.7 kernel (released 12/10/2012) thanks to the `hard work of Dmitry Monakhov <http://lkml.indiana.edu/hypermail/linux/kernel/1210.0/03434.html>`_.
|
|
@ -0,0 +1,56 @@
|
|||
##############
|
||||
Design Recipes
|
||||
##############
|
||||
|
||||
Learn how to build new data models, indexes, and more on top of the FoundationDB API. For more background, check out the :doc:`client-design` documentation.
|
||||
|
||||
* :doc:`Blob <blob>`: Store binary large objects (blobs) in the database.
|
||||
|
||||
* :doc:`Hierarchical Documents <hierarchical-documents>`: Create a representation for hierarchical documents.
|
||||
|
||||
* :doc:`Multimaps <multimaps>`: Create a multimap data structure with multiset values.
|
||||
|
||||
* :doc:`Priority Queues <priority-queues>`: Create a data structure for priority queues supporting operations for push, pop_min, peek_min, pop_max, and peek_max.
|
||||
|
||||
* :doc:`Queues <queues>`: Create a queue data structure that supports FIFO operations.
|
||||
|
||||
* :doc:`Segmented Range Reads <segmented-range-reads>`: Perform range reads in calibrated batches.
|
||||
|
||||
* :doc:`Simple Indexes <simple-indexes>`: Add (one or more) indexes to allow efficient retrieval of data in multiple ways.
|
||||
|
||||
* :doc:`Spatial Indexing <spatial-indexing>`: Create a spatial index for the database.
|
||||
|
||||
* :doc:`Subspace Indirection <subspace-indirection>`: Employ subspace indirection to manage bulk inserts or similar long-running operations.
|
||||
|
||||
* :doc:`Tables <tables>`: Create a table data structure suitable for sparse data.
|
||||
|
||||
* :doc:`Vector <vector>`: Create a vector data structure.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:titlesonly:
|
||||
:hidden:
|
||||
|
||||
blob
|
||||
blob-java
|
||||
hierarchical-documents
|
||||
hierarchical-documents-java
|
||||
multimaps
|
||||
multimaps-java
|
||||
priority-queues
|
||||
priority-queues-java
|
||||
queues
|
||||
queues-java
|
||||
segmented-range-reads
|
||||
segmented-range-reads-java
|
||||
simple-indexes
|
||||
simple-indexes-java
|
||||
spatial-indexing
|
||||
spatial-indexing-java
|
||||
subspace-indirection
|
||||
subspace-indirection-java
|
||||
tables
|
||||
tables-java
|
||||
vector
|
||||
vector-java
|
||||
|
|
@ -0,0 +1,797 @@
|
|||
.. default-domain:: py
|
||||
.. default-domain:: py
|
||||
.. highlight:: python
|
||||
.. module:: fdb
|
||||
|
||||
.. Required substitutions for api-common.rst.inc
|
||||
|
||||
.. |database-type| replace:: ``Database``
|
||||
.. |database-class| replace:: ``Database``
|
||||
.. |database-auto| replace:: FIXME
|
||||
.. |transaction-class| replace:: ``Transaction``
|
||||
.. |get-key-func| replace:: get_key()
|
||||
.. |get-range-func| replace:: get_range()
|
||||
.. |commit-func| replace:: ``commit()``
|
||||
.. |init-func| replace:: FIXME
|
||||
.. |open-func| replace:: FIXME
|
||||
.. |set-cluster-file-func| replace:: FIXME
|
||||
.. |set-local-address-func| replace:: FIXME
|
||||
.. |on-error-func| replace:: ``on_error()``
|
||||
.. |null-type| replace:: FIXME
|
||||
.. |error-type| replace:: exception
|
||||
.. |error-raise-type| replace:: raise
|
||||
.. |reset-func-name| replace:: FIXME
|
||||
.. |reset-func| replace:: FIXME
|
||||
.. |cancel-func| replace:: FIXME
|
||||
.. |read-your-writes-disable-option| replace:: FIXME
|
||||
.. |future-cancel| replace:: FIXME
|
||||
.. |max-watches-database-option| replace:: FIXME
|
||||
.. |future-type-string| replace:: FIXME
|
||||
.. |lazy-iterator-object| replace:: FIXME
|
||||
.. |key-meth| replace:: FIXME
|
||||
.. |directory-subspace| replace:: FIXME
|
||||
.. |directory-layer| replace:: FIXME
|
||||
.. |subspace| replace:: FIXME
|
||||
.. |subspace-api| replace:: FIXME
|
||||
.. |as-foundationdb-key| replace:: FIXME
|
||||
.. |as-foundationdb-value| replace:: FIXME
|
||||
.. |tuple-layer| replace:: FIXME
|
||||
.. |dir-path-type| replace:: FIXME
|
||||
.. |node-subspace| replace:: FIXME
|
||||
.. |content-subspace| replace:: FIXME
|
||||
.. |allow-manual-prefixes| replace:: FIXME
|
||||
|
||||
.. include:: api-common.rst.inc
|
||||
|
||||
###############
|
||||
Developer Guide
|
||||
###############
|
||||
|
||||
FoundationDB's scalability and performance make it an ideal back end for supporting the operation of critical applications. FoundationDB provides a simple data model coupled with powerful transactional integrity. This document gives an overview of application development using FoundationDB, including use of the API, working with transactions, and performance considerations.
|
||||
|
||||
.. note:: For operational advice about how to setup and maintain a FoundationDB cluster, see :doc:`administration`.
|
||||
|
||||
Data model
|
||||
==========
|
||||
|
||||
FoundationDB's core data model is an ordered key-value store. Also known as an ordered associative array, map, or dictionary, this is a data structure composed of a collection of key-value pairs in which all keys are unique and ordered. Both keys and values in FoundationDB are simple byte strings. Apart from storage and retrieval, the database does not interpret or depend on the content of values. In contrast, keys are treated as members of a total order, the lexicographic order over the underlying bytes, in which keys are sorted by each byte in order.
|
||||
|
||||
The combination of the core data model and multikey transactions allows an application to build richer data models and libraries that inherit FoundationDB's scalability, performance, and integrity. Richer data models are designed by mapping the application's data to keys and values in a way that yields an effective abstraction and enables efficient storage and retrieval.
|
||||
|
||||
.. note:: For guidance on mapping richer data models to FoundationDB's core, see :doc:`data-modeling`.
|
||||
|
||||
.. _developer-guide-namespace-management:
|
||||
|
||||
Namespace management
|
||||
====================
|
||||
|
||||
The keys in FoundationDB's key-value store can be viewed as elements of a single, global keyspace. Your application will probably have multiple kinds of data to store, and it's a good idea to separate them into different `namespaces <http://en.wikipedia.org/wiki/Namespace_(computer_science)>`_. The use of distinct namespaces will allow you to avoid conflicts among keys as your application grows.
|
||||
|
||||
Because of the ordering of keys, a namespace in FoundationDB is defined by any prefix prepended to keys. For example, if we use a prefix ``'alpha'``, any key of the form ``'alpha' + remainder`` will be nested under ``'alpha'``. Although you can manually manage prefixes, it is more convenient to use the :ref:`tuple layer <data-modeling-tuples>`. To define a namespace with the tuple layer, just create a tuple ``(namespace_id)`` with an identifier for the namespace. You can add a new key ``(foo, bar)`` to the namespace by extending the tuple: ``(namespace_id, foo, bar)``. You can also create nested namespaces by extending your tuple with another namespace identifier: ``(namespace_id, nested_id)``. The tuple layer automatically encodes each of these tuples as a byte string that preserves its intended order.
|
||||
|
||||
.. note:: :ref:`Subspaces <developer-guide-sub-keyspaces>` employ the tuple layer to provide a convenient syntax for namespaces.
|
||||
|
||||
.. _developer-guide-sub-keyspaces:
|
||||
|
||||
Subspaces
|
||||
---------
|
||||
|
||||
Subspaces provide the recommended way to define :ref:`namespaces <developer-guide-namespace-management>` for different categories of data. As a best practice, you should always use at least one subspace as a namespace for your application data.
|
||||
|
||||
.. note:: While subspaces can be used directly, they are also returned when creating or opening a :ref:`directory <developer-guide-directories>`. Directories are designed to manage related subspaces.
|
||||
|
||||
Each FoundationDB language binding provides a :ref:`Subspace class <api-python-subspaces>` to help use subspaces uniformly. An instance of the class stores a prefix used to identify the namespace and automatically prepends it when encoding tuples into keys. Likewise, it removes the prefix when decoding keys. A subspace can be initialized by supplying it with a prefix tuple::
|
||||
|
||||
my_space = Subspace(prefix_tuple)
|
||||
|
||||
It can also optionally take a byte string as a prefix element that will be prepended to all keys packed by the subspace::
|
||||
|
||||
my_space = Subspace(prefix_tuple, prefix_element)
|
||||
|
||||
.. note:: A subspace formed with a byte string as a prefix element is not fully compatible with the tuple layer. Keys stored within it cannot be unpacked as tuples.
|
||||
|
||||
In many of the language bindings, index notation can be used on a subspace to create a nested subspace. The new subspace will have the same prefix extended by the index value::
|
||||
|
||||
my_space = Subspace(('foo',))
|
||||
|
||||
# new_space will have prefix tuple ('foo', 'bar')
|
||||
new_space = my_space['bar']
|
||||
|
||||
The :class:`Subspace` methods will then work as expected for ``new_space``, with its prefix nested within that of ``my_space``.
|
||||
|
||||
For example, suppose your application tracks profile data for your users. You could store the data in a ``user_space`` subspace that would make ``'user'`` the first element of each tuple. A back-end function might have the form::
|
||||
|
||||
user_space = Subspace(('user',))
|
||||
|
||||
@fdb.transactional
|
||||
def set_user_data(tr, key, value):
|
||||
tr[user_space.pack((key,))] = str(value)
|
||||
|
||||
Subspaces support the :ref:`as_foundationdb_key <api-python-keys>` method to implicitly pack keys, so you could also write the ``set_user_data`` as::
|
||||
|
||||
@fdb.transactional
|
||||
def set_user_data(tr, key, value):
|
||||
tr[user_space[key]] = str(value)
|
||||
|
||||
Finally, you may want to clear a subspace before working with it (as long as you're sure it should be empty)::
|
||||
|
||||
@fdb.transactional
|
||||
def clear_subspace(tr, subspace):
|
||||
tr.clear_range_startswith(subspace.key())
|
||||
|
||||
.. _developer-guide-directories:
|
||||
|
||||
Directories
|
||||
-----------
|
||||
|
||||
FoundationDB provides :ref:`directories <api-python-directories>` (available in each language binding) as a tool for managing related :ref:`subspaces <developer-guide-sub-keyspaces>`. Directories are a recommended approach for administering applications. Each application should create or open at least one directory to manage its subspaces.
|
||||
|
||||
Directories are identified by hierarchical paths analogous to the paths in a Unix-like file system. A path is represented as a tuple of strings. Each directory has an associated subspace used to store its content. The directory layer maps each path to a short prefix used for the corresponding subspace. In effect, directories provide a level of indirection for access to subspaces.
|
||||
|
||||
This design has significant benefits: while directories are logically hierarchical as represented by their paths, their subspaces are not physically nested in a corresponding way. For example, suppose we create a few directories with increasing paths, such as::
|
||||
|
||||
>>> a = fdb.directory.create(db, ('alpha',))
|
||||
>>> b = fdb.directory.create(db, ('alpha', 'bravo'))
|
||||
>>> c = fdb.directory.create(db, ('alpha', 'bravo', 'charlie'))
|
||||
|
||||
The prefixes of ``a``, ``b``, and ``c`` are allocated independently and will usually not increase in length. The indirection from paths to subspaces allows keys to be kept short and makes it fast to move directories (i.e., rename their paths).
|
||||
|
||||
Paths in the directory layer are always relative. In particular, paths are interpreted relative to the directory in which an operation is performed. For example, we could have created the directories ``a``, ``b``, and ``c`` as follows::
|
||||
|
||||
>>> a = fdb.directory.create(db, ('alpha',))
|
||||
>>> b = a.create(db, ('bravo',))
|
||||
>>> c = b.create(db, ('charlie',))
|
||||
|
||||
We can easily check that the resulting paths are the same as before::
|
||||
|
||||
>>> a.get_path()
|
||||
(u'alpha',)
|
||||
>>> b.get_path()
|
||||
(u'alpha', u'bravo')
|
||||
>>> c.get_path()
|
||||
(u'alpha', u'bravo', u'charlie')
|
||||
|
||||
Usage
|
||||
~~~~~
|
||||
|
||||
The directory layer exposes a ``DirectoryLayer`` class and a ready-to-use instance of it that you can access as ``fdb.directory``. You can also create your own instance, which allows you to specify your own prefix for a subspace. For example, in Python, you could use::
|
||||
|
||||
>>> dir_layer = DirectoryLayer(content_subspace = Subspace(rawPrefix='\x01'))
|
||||
|
||||
You can use a ``DirectoryLayer`` instance to create a new directory, specifying its path as a tuple. The ``create`` method returns a newly created directory::
|
||||
|
||||
>>> users = fdb.directory.create(db, ('users',))
|
||||
>>> users
|
||||
DirectorySubspace(path=(u'users',), prefix='\x157')
|
||||
|
||||
A directory returned by ``create`` is a ``DirectorySubspace`` that fulfills the interface of both a ``DirectoryLayer`` and a ``Subspace``. Therefore, the directory can be used to access subdirectories recursively:
|
||||
|
||||
>>> inactive = users.create(db, ('inactive',))
|
||||
>>> inactive
|
||||
DirectorySubspace(path=(u'users', u'inactive'), prefix='\x15&')
|
||||
|
||||
The directory can also be used as a subspace to store content::
|
||||
|
||||
>>> db[users['Smith']] = ''
|
||||
|
||||
The directory layer uses a high-contention allocator to efficiently map the path to a short prefix for the directory's subspace.
|
||||
|
||||
If the directory was created previously (e.g., in a prior session or by another client), you can open it via its path. Like ``create``, the ``open`` method returns a directory::
|
||||
|
||||
>>> users = fdb.directory.open(db, ('users',))
|
||||
|
||||
It's often convenient to use a combined ``create_or_open`` method::
|
||||
|
||||
>>> products = fdb.directory.create_or_open(db, ('products',))
|
||||
>>> orders = fdb.directory.create_or_open(db, ('orders',))
|
||||
|
||||
As noted above, all of the operations defined for a ``DirectoryLayer`` can also be called on a directory to operate on its subdirectories::
|
||||
|
||||
>>> cancelled_orders = orders.create_or_open(db, ('cancelled',))
|
||||
|
||||
Once created, directory paths can be changed using ``move``::
|
||||
|
||||
>>> store = fdb.directory.create_or_open(db, ('store',))
|
||||
>>> users = fdb.directory.move(db, ('users',), ('store', 'users'))
|
||||
>>> products = fdb.directory.move(db, ('products',), ('store', 'products'))
|
||||
|
||||
A directory path can also be changed via its subspace using ``move_to``::
|
||||
|
||||
>>> orders = orders.move_to(db, ('store', 'orders'))
|
||||
|
||||
You can list the subdirectories of a directory. ``list`` returns directory names (the unicode string for the last component of the path), not subspaces or their contents::
|
||||
|
||||
>>> fdb.directory.list(db)
|
||||
[u'store']
|
||||
>>> store.list(db) # or fdb.directory.list(db, ('store',))
|
||||
[u'orders', u'products', u'users']
|
||||
>>> store.list(db, ('orders',))
|
||||
[u'cancelled']
|
||||
|
||||
Directories can be removed, with or without a prior test for existence::
|
||||
|
||||
>>> fdb.directory.remove_if_exists(db, ('store', 'temp'))
|
||||
>>> users.remove(db)
|
||||
|
||||
Although ``create_or_open`` and ``remove_if_exists`` cover the most common cases, you can also explicitly test for the existence of a directory::
|
||||
|
||||
>>> users.exists(db)
|
||||
False
|
||||
>>> store.exists(db, ('products',))
|
||||
True
|
||||
|
||||
Subdirectories and nested subspaces
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
It's common to have a subset of data that we want to arrange hierarchically under a directory. We can do this in two ways: by creating a subdirectory or a nested subspace. Let's return to our ``users`` directory for a closer look.
|
||||
|
||||
We may have data that is logically subordinate to our primary data but needs to be handled distinctly. For example, suppose we have inactive users that we want to store separately from our regular users. We could create a subdirectory for them::
|
||||
|
||||
>>> inactive = users.create(db, ('inactive',))
|
||||
>>> inactive
|
||||
DirectorySubspace(path=(u'users', u'inactive'), prefix='\x15&')
|
||||
|
||||
A subdirectory's data is stored under a prefix unrelated to that of its parent, which allows the prefix to be kept short and makes the subdirectory fast and easy to move. The managed prefix also makes it harder to jointly access data across subdirectories (e.g., you cannot usefully perform a range read across subdirectories).
|
||||
|
||||
Conversely, we often have data that we want to access as part of a single data set. For example, suppose we want ``users`` to store a ``user_ID`` and related attributes. We could nest a subspace for ``'ID'`` under ``users`` and store each attribute beneath it::
|
||||
|
||||
>>> db[ users['ID'][user_ID][lastname][firstname] ] = ''
|
||||
|
||||
Here, we're just employing ``users`` as a subspace, so all of the data modeling techniques using :ref:`subspaces <developer-guide-sub-keyspaces>` are available. Of course, data stored in a nested subspace cannot be moved as easily as a subdirectory.
|
||||
|
||||
Directory partitions
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Under normal operation, a directory does not share a common key prefix with its subdirectories. As a result, related directories are not necessarily located together in key-space. This means that you cannot use a single range query to read all contents of a directory and its descendants simultaneously, for example.
|
||||
|
||||
For most applications this behavior is acceptable, but in some cases it may be useful to have a directory tree in your hierarchy where every directory shares a common prefix. For this purpose, the directory layer supports creating *partitions* within a directory. A partition is a directory whose prefix is prepended to all of its descendant directories' prefixes.
|
||||
|
||||
A partition can be created by specifying the byte string ``'partition'`` for the layer parameter::
|
||||
|
||||
>>> partition = fdb.directory.create(db, ('p1',), layer=b'partition')
|
||||
>>> users = partition.create_or_open(db, ('users',))
|
||||
|
||||
Directory partitions have the following drawbacks, and in general they should not be used unless specifically needed:
|
||||
|
||||
* Directories cannot be moved between different partitions.
|
||||
* Directories in a partition have longer prefixes than their counterparts outside of partitions, which reduces performance. Nesting partitions inside of other partitions results in even longer prefixes.
|
||||
* The root directory of a partition cannot be used to pack/unpack keys and therefore cannot be used to create subspaces. You must create at least one subdirectory of a partition in order to store content in it.
|
||||
|
||||
Working with the APIs
|
||||
=====================
|
||||
|
||||
FoundationDB supports client APIs for Python, Ruby, Node.js, Java, Go, and C. At a high level, each of these APIs supports transactions allowing a client to:
|
||||
|
||||
* Set a key-value pair
|
||||
* Get the value associated with a key
|
||||
* Resolve a :ref:`key selector <key selectors>` to a key
|
||||
* Get a range of key-value pairs (The range can be specified with keys or with key selectors)
|
||||
* Clear a key (and its associated value)
|
||||
* Clear a range of keys (and their values)
|
||||
|
||||
.. note:: For details on the language-specific APIs, see the corresponding document under :doc:`api-reference`.
|
||||
|
||||
.. _system-keys:
|
||||
|
||||
.. note:: All keys that start with the byte ``0xFF`` (255) are reserved for internal system use and should not be modified by the user. They cannot be read or written in a transaction unless it sets the ``ACCESS_SYSTEM_KEYS`` transaction option. Note that, among many other options, simply prepending a single zero-byte to all user-specified keys will avoid the reserved range and create a clean key space.
|
||||
|
||||
.. _key selectors:
|
||||
|
||||
Key selectors
|
||||
-------------
|
||||
|
||||
|keysel-blurb1|
|
||||
|
||||
Each language API exposes constructors for four common forms of key selector. For example, in Python:
|
||||
|
||||
last_less_than(key)
|
||||
The lexicographically greatest key present in the database which is lexicographically strictly less than the given byte string key.
|
||||
|
||||
last_less_or_equal(key)
|
||||
The lexicographically greatest key present in the database which is lexicographically less than or equal to the given byte string key.
|
||||
|
||||
first_greater_than(key)
|
||||
The lexicographically least key present in the database which is lexicographically strictly greater than the given byte string key.
|
||||
|
||||
first_greater_or_equal(key)
|
||||
The lexicographically least key present in the database which is lexicographically greater than or equal to the given byte string key.
|
||||
|
||||
For example, suppose you want to read a range from a ``begin`` key to an ``end`` key *inclusive*. The :meth:`Transaction.get_range` method returns a range *exclusive* of its end key, so we can use a key selector to retrieve the desired range::
|
||||
|
||||
X = tr.get_range(begin, fdb.KeySelector.first_greater_than(end))
|
||||
|
||||
You can add or subtract an offset to or from a key selector. For example, in Python::
|
||||
|
||||
fdb.KeySelector.first_greater_than('apple') + 1
|
||||
|
||||
selects the second key following ``'apple'``.
|
||||
|
||||
.. note:: The current version of FoundationDB *does not* resolve key selectors with large offsets in O(1) time. See the :ref:`dont-use-key-selectors-for-paging` known limitation.
|
||||
|
||||
All possible key selectors can be constructed using one of the four "common form" constructors and a positive or negative offset. Alternatively, general key selectors can be manually constructed by specifying:
|
||||
|
||||
1. A reference key
|
||||
2. An equality flag (boolean)
|
||||
3. An offset (integer)
|
||||
|
||||
To "resolve" these key selectors FoundationDB first finds the last key less than the reference key (or equal to the reference key, if the equality flag is true), then moves forward a number of keys equal to the offset (or backwards, if the offset is negative). If a key selector would otherwise describe a key off the beginning of the database (before the first key), it instead resolves to the empty key ``''``. If it would otherwise describe a key off the end of the database (after the last key), it instead resolves to the key ``'\xFF'`` (or ``'\xFF\xFF'`` if the transaction has been granted access to the system keys).
|
||||
|
||||
Transaction basics
|
||||
==================
|
||||
|
||||
Transactions in FoundationDB
|
||||
----------------------------
|
||||
|
||||
FoundationDB provides concurrency control via transactions, allowing multiple clients to concurrently read and write data in the database with strong guarantees about how they affect each other. Specifically, FoundationDB provides global, ACID transactions with serializable isolation using optimistic concurrency.
|
||||
|
||||
All reads and modifications of key-value pairs in FoundationDB are done within the context of a transaction. A transaction is a small unit of work that is both reliably performed and logically independent of other transactions.
|
||||
|
||||
In Python, a simple transaction looks like this::
|
||||
|
||||
@fdb.transactional
|
||||
def example(tr):
|
||||
# Read two values from the database
|
||||
a = tr.get('a')
|
||||
b = tr.get('b')
|
||||
# Write two key-value pairs to the database
|
||||
tr.set('c', a+b)
|
||||
tr.set('d', a+b)
|
||||
|
||||
example(db)
|
||||
|
||||
Once a call to the ``example()`` function returns, it is as if all "gets" and "sets" inside it happened at a single instant (at some point after ``example()`` was called and before it returned). If it *never* returns (suffers a power failure, raises an exception, etc.) then it is either as if the "get" and "set" methods all happened at an instantaneous point in time, or as if none of them happened.
|
||||
|
||||
.. _ACID:
|
||||
|
||||
This ensures the following properties, known collectively as "ACID":
|
||||
|
||||
* **Atomicity**: Either all of the writes in the transaction happen, or none of them happen.
|
||||
* **Consistency**: If each individual transaction maintains a database invariant (for example, from above, that the ``'c'`` and ``'d'`` keys always have the same value), then the invariant is maintained even when multiple transactions are modifying the database concurrently.
|
||||
* **Isolation**: It is as if transactions executed one at a time (serializability).
|
||||
* **Durability**: Once a transaction succeeds, its writes will not be lost despite any failures or network partitions.
|
||||
|
||||
An additional important property, though technically not part of ACID, is also guaranteed:
|
||||
|
||||
* **Causality**: A transaction is guaranteed to see the effects of all other transactions that commit before it begins.
|
||||
|
||||
FoundationDB implements these properties using multiversion concurrency control (MVCC) for reads and optimistic concurrency for writes. As a result, neither reads nor writes are blocked by other readers or writers. Instead, conflicting transactions will fail at commit time and will usually be retried by the client.
|
||||
|
||||
In particular, the reads in a transaction take place from an instantaneous snapshot of the database. From the perspective of the transaction this snapshot is not modified by the writes of other, concurrent transactions. When the transaction is ready to be committed, the FoundationDB cluster checks that it does not conflict with any previously committed transaction (i.e. that no value read by a transaction has been modified by another transaction since the read occurred) and, if it does conflict, rejects it. Rejected conflicting transactions are usually retried by the client. Accepted transactions are written to disk on multiple cluster nodes and then reported accepted to the client.
|
||||
|
||||
* For more background on transactions, see Wikipedia articles for `Database transaction <http://en.wikipedia.org/wiki/Database_transaction>`_, `Atomicity (database systems) <http://en.wikipedia.org/wiki/Atomicity_(database_systems)>`_, and `Concurrency Control <http://en.wikipedia.org/wiki/Concurrency_control>`_.
|
||||
|
||||
Transaction retry loops
|
||||
-----------------------
|
||||
|
||||
In most client APIs, there is a way of encapsulating a block of code as part of a transaction. For example, in Python, the ``@fdb.transactional`` decorator encapsulates retrying errors for the client. Here is a more detailed view of what the encapsulation does::
|
||||
|
||||
def example_encapsulated(db):
|
||||
# make a new Transaction object on the database
|
||||
tr = db.create_transaction()
|
||||
|
||||
while True:
|
||||
try:
|
||||
# Read two values from the transaction snapshot
|
||||
a = tr.get('a')
|
||||
b = tr.get('b')
|
||||
# Write two key-value pairs to the transaction snapshot
|
||||
tr.set('c', a+b)
|
||||
tr.set('d', a+b)
|
||||
|
||||
# Try to commit the transaction, and wait for the result
|
||||
tr.commit().wait()
|
||||
|
||||
# Success!
|
||||
return
|
||||
except fdb.FDBError as error:
|
||||
# The transaction conflicted or there was a transient failure.
|
||||
# Ask the FDB API whether and when to retry the error
|
||||
# (Also resets the tr object to be ready to use again)
|
||||
tr.on_error(error).wait()
|
||||
|
||||
Note that, while the ``@fdb.transactional`` decorator encapsulates an entire function, only the database operations it contains are part of the transaction and enjoy enforcement of ACID properties. Operations that mutate client memory (e.g., ordinary variable assignments or changes to data structures) are not "rolled back" when a transaction conflicts. You should place such operations outside of the retry loop unless it is acceptable for them to be executed when a transaction conflicts and is retried. See :ref:`developer-guide-unknown-results` for an example.
|
||||
|
||||
.. _developer-guide-programming-with-futures:
|
||||
|
||||
Programming with futures
|
||||
------------------------
|
||||
|
||||
Many of FoundationDB's API functions are *asynchronous*: rather than blocking the calling thread until the result is available they immediately return a *future* object. A future represents a value (or error) to be available at some later time.
|
||||
|
||||
For languages in which it's supported, the simplest approach to futures is implicit blocking. This involves using the future as if it were an object of the result type. For example, the following Python code uses futures to do multiple read operations in parallel, thereby reducing transaction latency. The reads of the 'A' and 'B' keys will be done in parallel because ``a`` and ``b`` are futures::
|
||||
|
||||
# tr is a transaction
|
||||
# Read two values from the database in parallel
|
||||
a = tr.get('A')
|
||||
b = tr.get('B')
|
||||
print a+b
|
||||
|
||||
The addition of ``a`` and ``b`` implicitly blocks. You can also explicitly block on a future
|
||||
until it's ready.
|
||||
|
||||
By default, FoundationDB supports read-your-writes, meaning that reads reflect the results of prior writes within a transaction. FoundationDB maximizes parallelism of database operations within a transaction, subject to enforcing the sequential semantics of those operations, such as when a key is written before it is read.
|
||||
|
||||
Another approach to programming with futures in FoundationDB is to set a callback function to be invoked asynchronously when the future is ready.
|
||||
|
||||
.. note:: Be very careful when mixing callbacks with explicit or implicit blocking. Blocking in a callback on a non-ready future will cause a deadlock. Blocking on anything else or performing CPU intensive tasks will block the FoundationDB client thread and therefore all database access from that client.
|
||||
|
||||
For further details, see the :doc:`API reference <api-reference>` documentation for your language.
|
||||
|
||||
.. _developer-guide-range-reads:
|
||||
|
||||
Range reads
|
||||
-----------
|
||||
|
||||
FoundationDB supports efficient range reads based on the lexicographic ordering of keys. Range reads are a powerful technique commonly used with FoundationDB. A recommended approach is to design your keys so that you can use range reads to retrieve your most frequently accessed data.
|
||||
|
||||
A range read within a transaction returns a container that issues asynchronous reads to the database. The client usually processes the data by iterating over the values returned by the container. Range reads can be specified explicitly by giving the ``begin`` and ``end`` of a range::
|
||||
|
||||
for k, v in tr.get_range('a', 'm'):
|
||||
print(k, v)
|
||||
|
||||
To define ranges that extend from the beginning of the database, you can use the empty string ``''``::
|
||||
|
||||
for k, v in tr.get_range('', 'm'):
|
||||
print(k, v)
|
||||
|
||||
Likewise, to define ranges that extend to the end of the database, you can use the key ``'\xFF'``::
|
||||
|
||||
for k, v in tr.get_range('m', '\xFF'):
|
||||
print(k, v)
|
||||
|
||||
A range read can also retrieve all keys starting with a given ``prefix``::
|
||||
|
||||
for k, v in tr.get_range_startswith('a'):
|
||||
print(k, v)
|
||||
|
||||
The API balances latency and bandwidth by fetching data in batches as determined by the :ref:`streaming mode <streaming-mode-python>` parameter. Streaming modes allow you to customize this balance based on how you intend to consume the data. The default streaming mode (``iterator``) is quite efficient. However, if you anticipate that your range read will retrieve a large amount of data, you should select a streaming mode to match your use case. For example, if you're iterating through a large range and testing against a condition that may result in early termination, you may want to use the ``small`` streaming mode::
|
||||
|
||||
for k, v in tr.get_range_startswith('a', streaming_mode=fdb.StreamingMode.small):
|
||||
if halting_condition(k, v): break
|
||||
print(k, v)
|
||||
|
||||
In some situations, you may want to explicitly control the number of key-value pairs returned. You can use the ``limit`` parameter for this purpose. For example, suppose you want to iterate through a range while retrieving blocks of a predetermined size::
|
||||
|
||||
LIMIT = 100 # adjust according to data characteristics
|
||||
|
||||
@fdb.transactional
|
||||
def get_range_limited(tr, begin, end):
|
||||
keys_found = True
|
||||
while keys_found:
|
||||
keys_found = []
|
||||
for k, v in tr.get_range(begin, end, limit=LIMIT):
|
||||
keys_found.append(k)
|
||||
if keys_found:
|
||||
begin = fdb.KeySelector.first_greater_than(keys_found[-1])
|
||||
yield keys_found
|
||||
|
||||
For very large range reads, you can use multiple clients to perform reads concurrently. In this case, you'll want to estimate sub-ranges of roughly equal size based on the distribution of your keys. The :ref:`locality <api-python-locality>` functions can be used to derive estimates for the boundaries between sub-ranges.
|
||||
|
||||
.. _developer-guide-long-transactions:
|
||||
|
||||
Long-running transactions
|
||||
-------------------------
|
||||
|
||||
FoundationDB does not support long-running transactions, currently defined as
|
||||
those lasting over five seconds. The reasons for this design limitation relate
|
||||
to multiversion concurrency control and are discussed in :doc:`anti-features`.
|
||||
|
||||
You may have certain large operations that you're accustomed to implementing as
|
||||
long-running transactions with another database. How should you approach implementing
|
||||
your operation in FoundationDB?
|
||||
|
||||
The key consideration is whether your operation requires global consistency over
|
||||
all its data elements. In many cases, some smaller scope of consistency is acceptable.
|
||||
For example, many analytic computations are defined over a set of entities, such
|
||||
as users, and require consistency only for each entity, not globally across them.
|
||||
In this case, you can decompose the operation into multiple transactions, one for
|
||||
each entity. More generally, the strategy for operations requiring local consistency
|
||||
is to decompose them into a set of short transactions.
|
||||
|
||||
If your operation really does require global consistency, you can often use
|
||||
an indirection strategy. Here, you write a separate version of the data during
|
||||
the course of the operation and switch to the new version when
|
||||
the operation is done. The switch can be performed transactionally with a single
|
||||
change of a reference.
|
||||
|
||||
For example, you can store the version reference using a ``'mostRecentVersion'``
|
||||
key::
|
||||
|
||||
@fdb.transactional
|
||||
def setMostRecentVersion(tr, versionNumber):
|
||||
tr[fdb.tuple.pack(('mostRecentVersion',))] = str(versionNumber)
|
||||
|
||||
@fdb.transactional
|
||||
def getMostRecentVersion(tr):
|
||||
return tr[fdb.tuple.pack(('mostRecentVersion',))]
|
||||
|
||||
Your application would then store the relevant data using keys that encode the
|
||||
version number. The application would read data with a transaction that reads the
|
||||
most recent version number and uses it to reference the correct data. This
|
||||
strategy has the advantage of allowing consistent access to the current version
|
||||
of the data while concurrently writing the new version.
|
||||
|
||||
Working with transactions
|
||||
=========================
|
||||
|
||||
.. _developer-guide-atomic-operations:
|
||||
|
||||
Atomic operations
|
||||
-----------------
|
||||
|
||||
|atomic-ops-blurb1|
|
||||
|
||||
Atomic operations are ideal for operating on keys that multiple clients modify frequently. For example, you can use a key as a counter and increment it with atomic :func:`add`::
|
||||
|
||||
@fdb.transactional
|
||||
def increment(tr, counter):
|
||||
tr.add(counter, struct.pack('<i', 1))
|
||||
|
||||
Similarly, you can use a key as a flag and toggle it with atomic :func:`xor`::
|
||||
|
||||
@fdb.transactional
|
||||
def toggle(tr, flag):
|
||||
tr.xor(flag, struct.pack('=?',1))
|
||||
|
||||
Each atomic operation takes a packed string as an argument (as detailed in the :ref:`API reference <api-python-transaction-atomic-operations>`).
|
||||
|
||||
|atomic-ops-blurb2| By combining its logical steps into a single, read-free operation, FoundationDB can guarantee that the transaction performing the atomic operation will not conflict due to that operation.
|
||||
|
||||
.. note :: |atomic-ops-warning|
|
||||
|
||||
.. _developer-guide-unknown-results:
|
||||
|
||||
Transactions with unknown results
|
||||
---------------------------------
|
||||
|
||||
|unknown-result-blurb|
|
||||
|
||||
.. note :: The Python ``@fdb.transactional`` decorator and its counterparts in the other language APIs do not check for :ref:`commit_unknown_result <developer-guide-error-codes>`.
|
||||
|
||||
An *idempotent* transaction is one that has the same effect when committed twice as when committed once. If your transaction is already idempotent, there is nothing more to worry about. Otherwise, you should consider whether it is acceptable for your transaction to be committed twice. The following suggestions are useful for making your transaction idempotent:
|
||||
|
||||
* Avoid generating IDs within the retry loop. Instead, create them prior to the loop and pass them in. For example, if your transaction records an account deposit with a deposit ID, generate the deposit ID outside of the loop.
|
||||
|
||||
* Within the retry loop, check for the completion of one of the transaction's unique side effects to determine if the whole transaction has previously completed. If the transaction doesn't naturally have such a side effect, you can create one by setting a unique key.
|
||||
|
||||
The following example illustrates both techniques. Together, they make a transaction idempotent that otherwise would not have been::
|
||||
|
||||
# Increases account balance and stores a record of the deposit with a unique depositId
|
||||
@fdb.transactional
|
||||
def deposit(tr, acctId, depositId, amount):
|
||||
|
||||
# If the deposit record exists, the deposit already succeeded, and we can quit
|
||||
depositKey = fdb.tuple.pack(('account', acctId, depositId))
|
||||
if tr[depositKey].present(): return
|
||||
|
||||
amount = struct.pack('<i', amount)
|
||||
tr[depositKey] = amount
|
||||
|
||||
# The above check ensures that the balance update is executed only once
|
||||
balanceKey = fdb.tuple.pack(('account', acctId))
|
||||
tr.add(balanceKey, amount)
|
||||
|
||||
.. _conflict-ranges:
|
||||
|
||||
Conflict ranges
|
||||
---------------
|
||||
|
||||
By default, FoundationDB transactions guarantee :ref:`serializable isolation <ACID>`, which results in a state that *could* have been produced by executing transactions one at a time, even though they may actually have been executed concurrently. FoundationDB maintains serializable isolation by detecting conflicts among concurrent transactions and allowing only a non-conflicting subset of them to succeed. Two concurrent transactions conflict if the first to commit writes a value that the second reads. In this case, the second transaction will fail. Clients will usually retry failed transactions.
|
||||
|
||||
To detect conflicts, FoundationDB tracks the ranges of keys each transaction reads and writes. While most applications will use the serializable isolation that transactions provide by default, FoundationDB also provides several API features that manipulate conflict ranges to allow more precise control.
|
||||
|
||||
Conflicts can be *avoided*, reducing isolation, in two ways:
|
||||
|
||||
* Instead of ordinary (serializable) reads, you can perform :ref:`snapshot reads <snapshot isolation>`, which do not add read conflict ranges.
|
||||
* You can use :ref:`transaction options <api-python-transaction-options>` to disable conflict ranges for writes.
|
||||
|
||||
Conflicts can be *created*, increasing isolation, by :ref:`explicitly adding <api-python-conflict-ranges>` read or write conflict ranges.
|
||||
|
||||
For example, suppose you have a transactional function that increments a set of counters using atomic addition. :ref:`developer-guide-atomic-operations` do not add read conflict ranges and so cannot cause the transaction in which they occur to fail. Most of the time, this is exactly what we want. However, suppose there is another transaction that (infrequently) resets one or more counters, and our contract requires that we must advance all specified counters in unison. We want to guarantee that if a counter is reset during an incrementing transaction, then the incrementing transaction will conflict. We can selectively add read conflict ranges for this purpose::
|
||||
|
||||
@fdb.transactional
|
||||
def guarded_increment(tr, counters):
|
||||
for counter in counters:
|
||||
tr.add(counter, struct.pack('<i', 1))
|
||||
if reset(counter): tr.add_read_conflict_key(counter)
|
||||
|
||||
.. _snapshot isolation:
|
||||
|
||||
Snapshot reads
|
||||
--------------
|
||||
|
||||
|snapshot-blurb1|
|
||||
|
||||
The serializable isolation that transactions maintain by default has little performance cost when there are few conflicts but can be expensive when there are many. FoundationDB therefore also permits individual reads within a transaction to be done as snapshot reads. Snapshot reads differ from ordinary (serializable) reads by permitting the values they read to be modified by concurrent transactions, whereas serializable reads cause conflicts in that case.
|
||||
|
||||
Consider a transaction which needs to remove and return an arbitrary value from a small range of keys. The simplest implementation (using serializable isolation) would be::
|
||||
|
||||
@fdb.transactional
|
||||
def remove_one(tr, range):
|
||||
all_kv = tr[range]
|
||||
key, value = random.choice(list(all_kv))
|
||||
del tr[key]
|
||||
return value
|
||||
|
||||
Unfortunately, if a concurrent transaction happens to insert a new key anywhere in the range, our transaction will conflict with it and fail (resulting in a retry) because seeing the other transaction's write would change the result of the range read. FoundationDB is enforcing a stronger contract than we actually need. A snapshot read allows us to weaken the contract, but we don't want to weaken it too far: it's still important to us that the actual value we returned existed in the database at the time of our transaction. Adding a :ref:`conflict range <conflict-ranges>` for our key ensures that we will fail if someone else modifies the key simultaneously::
|
||||
|
||||
@fdb.transactional
|
||||
def remove_one(tr, range):
|
||||
all_kv = tr.snapshot[range] # Snapshot read
|
||||
key, value = random.choice(list(all_kv))
|
||||
tr.add_read_conflict_key(key) # Add conflict range
|
||||
del tr[key]
|
||||
return value
|
||||
|
||||
This transaction accomplishes the same task but won't conflict with the insert of a key elsewhere in the range. It will only conflict with a modification to the key it actually returns.
|
||||
|
||||
By default, snapshot reads see the effects of prior writes in the same transaction. (This read-your-writes behavior is the same as for ordinary, serializable reads.) Read-your-writes allows transactional functions (such as the above example) to be easily composed within a single transaction because each function will see the writes of previously invoked functions.
|
||||
|
||||
.. note::
|
||||
| The default read-your-writes behavior of snapshot reads is well-suited to the large majority of use cases. In less frequent cases, you may want to read from only a single version of the database. This behavior can be achieved through the appropriate :ref:`transaction options <api-python-snapshot-ryw>`. Transaction options are an advanced feature of the API and should be used with caution.
|
||||
|
|
||||
| Read-your-writes can be disabled (and re-enabled) within a transaction by using the options:
|
||||
|
|
||||
| * :meth:`Transaction.options.set_snapshot_ryw_disable`
|
||||
| * :meth:`Transaction.options.set_snapshot_ryw_enable`.
|
||||
|
|
||||
| A given snapshot read gets read-your-writes behavior unless the disable option has been previously set more times than the enable option in that transaction.
|
||||
|
|
||||
|
||||
Using snapshot reads is appropriate when the following conditions all hold:
|
||||
|
||||
* A particular read of frequently written values causes too many conflicts.
|
||||
* There isn't an easy way to reduce conflicts by splitting up data more granularly.
|
||||
* Any necessary invariants can be validated with added conflict ranges or more narrowly targeted serializable reads.
|
||||
|
||||
Transaction cancellation
|
||||
------------------------
|
||||
|
||||
The FoundationDB language bindings all provide a mechanism for cancelling an outstanding transaction. However, there are also special transaction options for specifying the conditions under which a transaction should automatically be cancelled.
|
||||
|
||||
In the following example, a retry loop is combined with transaction options that ensure that the operation will not be attempted more than 6 times or for longer than 3 seconds::
|
||||
|
||||
@fdb.transactional
|
||||
def example_with_cancellation(tr):
|
||||
# Set maximum number of times that on_error() can be called (implicitly by the decorator).
|
||||
# On the 6th time, on_error() will throw retry_limit_exceeded rather than resetting and retrying.
|
||||
tr.options.set_retry_limit(5)
|
||||
# Cancel transaction with transaction_timed_out after 3 seconds
|
||||
tr.options.set_timeout(3000)
|
||||
|
||||
# Read two values from the transaction snapshot
|
||||
a = tr.get('a')
|
||||
b = tr.get('b')
|
||||
# Write two key-value pairs to the transaction snapshot
|
||||
tr.set('c', a+b)
|
||||
tr.set('d', a+b)
|
||||
|
||||
.. note:: The ``set_retry_limit()`` option sets a maximum number of *retries*, not tries. So the transaction above will at most be attempted a total of six times.
|
||||
|
||||
Watches
|
||||
-------
|
||||
|
||||
Sometimes you want a client to monitor one or more keys for updates to their values by other clients. An obvious way to implement monitoring is by polling, i.e., periodically reading the key-values to check them for a change. FoundationDB provides watches to monitor keys more efficiently. Watches are created for a specified key and return a :ref:`future <developer-guide-programming-with-futures>` that becomes ready when there's a change to the key's value.
|
||||
|
||||
For example, suppose you have a polling loop that checks keys for changes once a second::
|
||||
|
||||
def polling_loop(db, keys):
|
||||
|
||||
@fdb.transactional
|
||||
def read_keys(tr):
|
||||
return {k:tr[k] for k in keys}
|
||||
|
||||
cache = {k:object() for k in keys}
|
||||
while True:
|
||||
value = read_keys(db)
|
||||
for k in keys:
|
||||
if cache[k] != value[k]:
|
||||
yield value[k]
|
||||
cache[k] = value[k]
|
||||
time.sleep(1)
|
||||
|
||||
You can use the loop to dispatch changes to a handler with something like::
|
||||
|
||||
for k, v in polling_loop(db, ['foo','bar','bat']): handle(k,v)
|
||||
|
||||
With watches, you can eliminate the sleep and perform new reads only after a change to one of the keys::
|
||||
|
||||
def watching_read_loop(db, keys):
|
||||
|
||||
@fdb.transactional
|
||||
def watch_keys(tr):
|
||||
return {k:tr[k] for k in keys}, [tr.watch(k) for k in keys]
|
||||
|
||||
cache = {k:object() for k in keys}
|
||||
while True:
|
||||
value, watches = watch_keys(db)
|
||||
for k in keys:
|
||||
if cache[k] != value[k]:
|
||||
yield value[k]
|
||||
cache[k] = value[k]
|
||||
fdb.Future.wait_for_any(*watches)
|
||||
|
||||
The version with watches will perform fewer unnecessary reads and detect changes with better resolution than by polling.
|
||||
|
||||
.. note :: Watches guarantee only that a value was changed; they make no guarantee about values you may subsequently read. In particular, there is no guarantee that a value you read will correspond to the change that triggered the watch. Another client may have changed a key back to its original value or to some third value between a watch becoming ready and your subsequent read. For further details, see the discussion of watches in the language-specific document of your choice under :doc:`api-reference`.
|
||||
|
||||
If you only need to detect the *fact* of a change, and your response doesn't depend on the new *value*, then you can eliminate the reads altogether::
|
||||
|
||||
def watching_loop(db, keys):
|
||||
|
||||
@fdb.transactional
|
||||
def watch_keys(tr):
|
||||
return [tr.watch(k) for k in keys]
|
||||
|
||||
while True:
|
||||
fdb.Future.wait_for_any(*watch_keys(db))
|
||||
yield
|
||||
|
||||
.. _developer-guide-peformance-considerations:
|
||||
|
||||
Performance considerations
|
||||
==========================
|
||||
|
||||
Latency
|
||||
-------
|
||||
|
||||
Like all systems, FoundationDB operates at a low latency while under low load and an increasing latency as the load approaches the saturation point. We have made efforts to allow FoundationDB to operate at a low latency even at moderate loads. However, if FoundationDB is being driven with a "saturating" load (e.g. batch processing), latencies can become very high as a line forms for requests. In this case, the transactions generating the saturating load should be run with a lower priority, allowing other transactions to skip ahead in line.
|
||||
|
||||
* For more information on setting transaction priorities, see the discussion of Transaction Options in the language-specific document of your choice under :doc:`api-reference`.
|
||||
|
||||
There are several places in a typical transaction that can experience database latency:
|
||||
|
||||
* **Starting the transaction**. This delay will be experienced as part of your first read (or part of ``getReadVersion()`` if using that API call). It will typically be a few milliseconds under moderate load, but under high write loads FoundationDB tries to concentrate most transaction latency here. This latency does not increase transaction conflicts (see :ref:`developer-guide-transaction-conflicts` below) since the transaction has not yet started.
|
||||
* **Individual reads**. These should take about 1 ms under moderate load on appropriate hardware. If a transaction performs many reads by waiting for each to complete before starting the next, however, these small latencies can add up. You can thus reduce transaction latency (and potentially conflicts) by doing as many of your reads as possible in parallel (i.e. by starting several reads before waiting on their results). See the :ref:`developer-guide-programming-with-futures` section of this document for an elegant way to achieve this.
|
||||
* **Committing the transaction**. Transactions that are not read-only must be committed, and the commit will not succeed until the transaction is fully (redundantly) durable. This takes time: averaging about 10 ms under normal loads with SSD hardware. This latency will be increased further in a geographically distributed system (in order to confirm that the transaction is durable in multiple datacenters). Only a small part of this latency impacts transaction conflicts.
|
||||
|
||||
Throughput requires concurrency
|
||||
-------------------------------
|
||||
|
||||
FoundationDB will only reach its maximum performance with a highly concurrent workload. This is a practical consideration that derives mathematically from the ratio of system throughput to system latency (known in queuing theory as `Little's Law <http://en.wikipedia.org/wiki/Little%27s_law>`_). For FoundationDB, a cluster might have a read latency of 1ms and be capable of millions of reads per second. To achieve such a rate, there must therefore be thousands of read requests happening concurrently. *Not having enough outstanding requests is the single biggest reason for low performance when using FoundationDB.*
|
||||
|
||||
There are several important techniques for achieving high concurrency:
|
||||
|
||||
* Whether your application does FoundationDB transactions in response to requests (as in web applications) or simply does transactions as fast as it can (as in a batch workload), make sure to run it with enough concurrent threads or processes---perhaps more than you would expect to provide optimal performance from experience with other database systems.
|
||||
* In many environments, there are cheaper (and sometimes less dangerous) alternatives to operating system threads for workloads that are bound by network latency. For example, in Python, the `gevent library <http://gevent.org/>`_ provides "coroutines" that have a simple thread-like programming model but are scheduled asynchronously in a single thread. FoundationDB's :doc:`Python API <api-python>` integrates with gevent and other language APIs have similar integrations. This can make it practical to run hundreds or thousands of concurrent transactions per core without much overhead. (If FoundationDB doesn't integrate with your favorite asynchronous programming tool, please let us know about it.)
|
||||
* Whenever possible, do multiple reads within a single transaction in parallel rather than sequentially. This reduces latency, and consequently reduces the number of concurrent transactions required to sustain a given throughput. See the :ref:`developer-guide-programming-with-futures` section of this document for an elegant way to achieve this.
|
||||
|
||||
.. _developer-guide-transaction-conflicts:
|
||||
|
||||
Minimizing conflicts
|
||||
---------------------
|
||||
|
||||
Frequent conflicts make FoundationDB operate inefficiently and should be minimized. They result from multiple clients trying to update the same keys at a high rate. Developers need to avoid this condition by spreading frequently updated data over a large set of keys.
|
||||
|
||||
.. note :: As a rule of thumb, if a key will be modified *more than 10-100 times per second*, a different data model should be considered.
|
||||
|
||||
In these situations:
|
||||
|
||||
* If the data stored in the key is large, consider :ref:`splitting it among multiple keys<largeval-splitting>` that can each be modified separately.
|
||||
* For a data structure like a counter, consider using :ref:`atomic operations <developer-guide-atomic-operations>` so that the write-only transactions do not conflict with each other. FoundationDB supports atomic operations for *addition*, *min*, *max*, bitwise *and*, bitwise *or*, and bitwise *xor*.
|
||||
* Consider performing selected reads as :ref:`snapshot reads <snapshot isolation>`, which eliminate conflicts with those reads but weaken transactional isolation.
|
||||
* For associative, commutative operations not supported as atomic operations, consider using adaptive sharding.
|
||||
* For operations that are order-dependent, consider inserting operations into a database queue prior to their execution by a subsequent transaction. The general pattern is to convert a read/modify/write operation on a logical value into an insertion into a list of changes to the value. Any client can then transactionally replace a portion of the list with its evaluated result. This pattern allows insertions to be decoupled from subsequent evaluations, which can take place in separate transactions. You can reach out on the `community forums <https://forums.foundationdb.org>`_ for help with finding the simplest solution to your actual use case.
|
||||
|
||||
.. _developer-guide-key-and-value-sizes:
|
||||
|
||||
Key and value sizes
|
||||
-------------------
|
||||
|
||||
Maintaining efficient key and value sizes is essential to getting maximum performance with FoundationDB. As a rule, smaller key sizes are better, as they can be more efficiently transmitted over the network and compared. In concrete terms, the highest performance applications will keep key sizes below 32 bytes. Key sizes above 10 kB are not allowed, and sizes above 1 kB should be avoided---store the data in the value if possible.
|
||||
|
||||
Value sizes are more flexible, with 0-10 kB normal. Value sizes cannot exceed 100 kB. Like any similar system, FoundationDB has a "characteristic value size" where the fixed costs of the random read roughly equal the marginal costs of the actual bytes. For FoundationDB running on SSD hardware, this characteristic size is roughly 1 kB for randomly accessed data and roughly 100 bytes for frequently accessed (cached) data.
|
||||
|
||||
If your keys or values are initially too large, try to revise your :doc:`data model <data-modeling>` to make them smaller.
|
||||
|
||||
Loading data
|
||||
------------
|
||||
|
||||
Loading data is a common task in any database. Loading data in FoundationDB will be most efficiently accomplished if a few guidelines are followed:
|
||||
|
||||
* Load small sequential chunks of the data set from random positions in the data set (to allow the system to efficiently distribute different data to different servers).
|
||||
* Do about 10KB of data in total writes per transaction.
|
||||
* Use about 50 concurrent transactions per loading process to allow efficient pipelining. You can increase the number of concurrent transactions as long as transaction latencies remain under about 1 second.
|
||||
* Use multiple processes loading in parallel if a single one is CPU-bound.
|
||||
|
||||
Using these techniques, our cluster of 24 nodes and 48 SSDs loads about 3 billion (100 byte) key-value pairs per hour.
|
|
@ -0,0 +1,49 @@
|
|||
#########
|
||||
Downloads
|
||||
#########
|
||||
|
||||
Client & Server Packages
|
||||
========================
|
||||
|
||||
FoundationDB packages are available on Artifactory for the following operating systems:
|
||||
|
||||
* `macOS <https://files.foundationdb.org/artifacts/5.1.0/release/osx/>`_. Supported on macOS >= 10.7. Installs client and (optionally) server.
|
||||
* `Ubuntu <https://files.foundationdb.org/artifacts/5.1.0/release/ubuntu/>`_. The server depends on the client. Supported on 64-bit Ubuntu >= 12.04, but beware of the Linux kernel bug in Ubuntu 12.x.
|
||||
* `RHEL/CentOS EL6 <https://files.foundationdb.org/artifacts/5.1.0/release/rhel6/>`_. The server depends on the client. Supported on 64-bit RHEL/CentOS (6.x).
|
||||
* `RHEL/CentOS EL7 <https://files.foundationdb.org/artifacts/5.1.0/release/rhel7/>`_. The server depends on the client. Supported on 64-bit RHEL/CentOS (7.x).
|
||||
|
||||
API Language Bindings
|
||||
=====================
|
||||
|
||||
C
|
||||
-
|
||||
|
||||
FoundationDB's C bindings are installed with the FoundationDB client binaries. You can find more details in the :doc:`C API Documentation <api-c>`.
|
||||
|
||||
Python 2.7 - 3.4
|
||||
----------------
|
||||
|
||||
The FoundationDB Python API is installed as part of your FoundationDB installation.
|
||||
|
||||
If you need to use the FoundationDB Python API from other Python installations or paths, download the `package <https://files.foundationdb.org/artifacts/5.1.0/release/python/>`_.
|
||||
|
||||
Ruby 1.9.3/3.0.2
|
||||
----------------
|
||||
|
||||
Download the `gem <https://files.foundationdb.org/artifacts/5.1.0/release/ruby/>`_.
|
||||
|
||||
Java JRE 1.8+
|
||||
-------------
|
||||
|
||||
Download the `jar and javadoc.jar <https://files.foundationdb.org/artifacts/5.1.0/release/java/>`_.
|
||||
|
||||
Node 0.8.x/0.10.x
|
||||
-----------------
|
||||
|
||||
Download the `node package <https://files.foundationdb.org/artifacts/5.1.0/release/nodejs/>`_.
|
||||
|
||||
|
||||
Go 1.1+
|
||||
-------
|
||||
|
||||
The FoundationDB Go package is available on `github <https://github.com/apple/foundationdb/tree/master/bindings/go>`_.
|
|
@ -0,0 +1,15 @@
|
|||
#####################
|
||||
Earlier Release Notes
|
||||
#####################
|
||||
|
||||
Contained here is a historical list of release notes related to the FoundationDB key-value store
|
||||
that may be useful for determining when a feature was introduced. Earlier release notes refer
|
||||
to the product prior to its acquisition from Apple and were previously on the public web.
|
||||
Release notes after the acquisition often include links to the relevant Radars which
|
||||
were used to track the feature's development.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:glob:
|
||||
|
||||
old-release-notes/*
|
|
@ -0,0 +1,40 @@
|
|||
###########
|
||||
Engineering
|
||||
###########
|
||||
|
||||
When we built FoundationDB, we didn't just want it to make something that rivaled the competition, we wanted to go above and beyond. Below are examples of the extra lengths we've taken to make an incredible product.
|
||||
|
||||
Flow
|
||||
====
|
||||
|
||||
FoundationDB began with ambitious goals for both :doc:`high performance <performance>` per node and :doc:`scalability <scalability>`. We knew that to achieve these goals we would face serious engineering challenges that would require tool breakthroughs. We'd need efficient asynchronous communicating processes like in Erlang or the Async in .NET, but we'd also need the raw speed, I/O efficiency, and control of C++. To meet these challenges, we developed several new tools, the most important of which is :doc:`flow`, a new programming language that brings actor-based concurrency to C++11. Flow adds about 10 keywords to C++11 and is technically a trans-compiler: the Flow compiler reads Flow code and compiles it down to raw C++11, which is then compiled to a native binary with a traditional toolchain. One of Flow’s most important jobs is enabling Simulation.
|
||||
|
||||
Simulation
|
||||
==========
|
||||
|
||||
We wanted FoundationDB to survive failures of machines, networks, disks, clocks, racks, data centers, file systems, etc., so we created a simulation framework closely tied to Flow. By replacing physical interfaces with shims, replacing the main epoll-based run loop with a time-based simulation, and running multiple logical processes as concurrent Flow Actors, Simulation is able to conduct a deterministic simulation of an entire FoundationDB cluster within a single thread! Even better, we are able to execute this simulation in a deterministic way, enabling us to reproduce problems and add instrumentation ex post facto. This incredible capability enabled us to build FoundationDB exclusively in simulation for the first 18 months and ensure exceptional fault tolerance long before it sent its first real network packet. For a database with as strong a contract as FoundationDB, testing is crucial, and over the years we have run the equivalent of *a trillion CPU-hours* of simulated stress testing. Read more about our :doc:`Simulation and Testing <testing>`.
|
||||
|
||||
RateKeeper
|
||||
==========
|
||||
|
||||
FoundationDB uses an intelligent control algorithm called RateKeeper to queue client transactions during heavy loads. Using principles from operational research and control theory, FoundationDB prevents system oscillation and reduces internal queue sizes by intelligently applying global backpressure. By handing out tickets and serving clients in order and at a controlled pace, latency is shifted from the read and commit operations to the transaction-creation line. This ensures continuous low-latency operation under all conditions and also allows transactions of different priorities to be queued separately, allowing concurrent batch and low-latency workloads.
|
||||
|
||||
Prioritization
|
||||
==============
|
||||
|
||||
Every task inside of Flow, especially disk access and network use, has its own priority. Carefully controlling the order at which tasks are completed is crucial to shaping performance at both light and heavy workloads. As operations enter the system, their priority generally increases over time to ensure that all operations are completed within an equitable timeframe.
|
||||
|
||||
Range clears
|
||||
============
|
||||
|
||||
Other ordered key-value stores frequently provide range clears that perform on the order of the number of keys cleared, much like range reads do. To provide FoundationDB with the most predictability and performance, we worked hard to make range clears ultra-efficient, taking only O(log N) time, which is for all practical purposes instant even for very large N. One in-house test clears an entire 10TB+ database with a single range clear. This gives the least-surprising behavior for a clear operation, even when your dataset has grown very large.
|
||||
|
||||
Range reads
|
||||
===========
|
||||
|
||||
Ordered key-value stores often provide range reads that perform on the order of the number of keys returned plus the number of keys recently cleared. The reason for this weakness is that it’s easier to use an asymptotically inefficient data structure that stores a large set of (data, version) pairs instead of a true multi-version representation. In FoundationDB, we use a persistent tree-type data structure to eliminate “tombstones” and similar hacks. This design allows range reads to truly be O(N) where N is the number of elements returned. For example, iterating over a recently cleared large range is efficient because the traversal skips the cleared range in a single step.
|
||||
|
||||
API versioning
|
||||
==============
|
||||
|
||||
Since the very beginning, FoundationDB has completely encapsulated multiple versions of its interface by requiring an explicit call to the “api_version” function before invoking any APIs. The goal of this design is to allow the server, client libraries, or bindings to be upgraded without having to modify client code at all. The client libraries, building on our C bindings, support all previous versions of all APIs.
|
|
@ -0,0 +1,36 @@
|
|||
###############
|
||||
Fault Tolerance
|
||||
###############
|
||||
|
||||
What is fault tolerance?
|
||||
========================
|
||||
|
||||
Many systems claim to be fault tolerant without really discussing the spectrum of failures they may face or their ability to maintain service in various cases. At one end of the spectrum, network packet collisions happen all the time with only minor impact; at the other end, if all the machines in a cluster lose power, the system will necessarily fail. The importance of fault tolerance lies in the middle range between these extremes. Fault tolerance is characterized by the *amount, duration, and likelihood* of data and service loss that may occur.
|
||||
|
||||
Distributed and replicated
|
||||
==========================
|
||||
|
||||
FoundationDB is built on a distributed shared-nothing architecture. This design gives us a huge advantage over any system running on a single computer, which must fail when the machine fails. FoundationDB divides the data into chunks and distributes multiple copies of each chunk to different storage servers, each of which is a separate physical computer with its own local storage. The only connection between the computers is a switched Ethernet network.
|
||||
|
||||
FoundationDB can tolerate a single computer failure with service interruption of at most a few seconds and no data loss, although maximum system throughput is affected. Single machine failures are one of the most common types of failure, so tolerance to such failure is essential.
|
||||
|
||||
Data distribution strategy
|
||||
==========================
|
||||
|
||||
If multiple machines fail, FoundationDB still deals gracefully with the possible service loss. Of course, data unavailability becomes a possibility because there are only a finite number of replicas of each chunk of data.
|
||||
|
||||
Any distributed system faces some basic probabilistic constraints. For example, take a system running on a 40-machine cluster. If each one of a million pieces of data is put on 4 random machines, and then 4 machines fail, unavailability of some data is almost certain. There are only about 100,000 possible combinations of 4 machines among 40, and with a million pieces of data, the failing combination of machines is almost certain to contain some of the million. (It will usually have about 10).
|
||||
|
||||
FoundationDB improves these probabilities by selecting "teams" of machines on which to distribute data. Instead of putting each chunk of data on a different set of machines, each machine can participate in multiple teams. In the above example, by selecting only 450 teams of 4 machines that each chunk of data can be on, the chance of data unavailability is reduced to about 0.5%.
|
||||
|
||||
Independence assumptions
|
||||
========================
|
||||
|
||||
As a further refinement, FoundationDB can be made aware that certain machines might tend to fail together. For example, every machine in a rack might share a network and power connection. If either failed, then the entire rack of machines would fail. We use this knowledge when choosing teams, taking care not to place any two machines in a team that would have a tendency to fail together. Pieces of data can then be intelligently distributed across racks or even datacenters, so that characteristic multimachine failures (for example, based on rack configuration) do not cause service interruption or data loss. Using this method, FoundationDB can continuously operate through a failure of an entire datacenter.
|
||||
|
||||
Other types of failure
|
||||
======================
|
||||
|
||||
There are many different types of failures: drives filling up, network routing errors, machine performance degradation, "dead" machines coming back to life, OS faults, etc. FoundationDB has been built from the ground up on a framework that allows simulation of all these types of failures. We've run hundreds of millions of stress tests that fail machines at very short intervals, induce unusually severe loads, delay communications channels at the worst time, or all of the above at once.
|
||||
|
||||
We have worked hard to design FoundationDB to maximize fault tolerance, maintaining performance and availability in the face of worst-case scenarios. As a result, FoundationDB is a very safe system for managing your data.
|
|
@ -0,0 +1,118 @@
|
|||
########
|
||||
Features
|
||||
########
|
||||
|
||||
FoundationDB has an ordered transactional API with useful properties and strong guarantees. Features in the core are deliberately kept to a minimum; data models and other abilities are exposed :doc:`via layers <architecture>`.
|
||||
|
||||
The Foundation
|
||||
==============
|
||||
|
||||
Scalable
|
||||
--------
|
||||
|
||||
FoundationDB adapts to efficiently support applications with diverse performance requirements. By using a shared-nothing distributed architecture, FoundationDB *scales out* by adding more machines to a cluster rather than just *scaling up* by increasing the capacity of a single machine. Best of all, the hard work of managing data redundancy, partitioning, caching, etc., is all handled automatically. Read more about our :doc:`scalability <scalability>`.
|
||||
|
||||
ACID transactions
|
||||
-----------------
|
||||
|
||||
All reads and writes in FoundationDB are accomplished using transactions. These transactions are fully ACID (Atomic, Consistent, Isolated, and Durable) and span multiple machines with high performance. FoundationDB's isolation is the highest available; transactions appear to occur sequentially. FoundationDB's durability is the strongest — all transactions are redundantly stored to disk before they are considered committed.
|
||||
|
||||
Fault tolerance
|
||||
---------------
|
||||
|
||||
A system designed to be distributed across many machines must be highly fault tolerant because the likelihood of hardware and network failures increases with the number of machines involved. FoundationDB has been designed and relentlessly tested to provide exceptionally high levels of fault tolerance. We've gone much further than designing for "no single point of failure". FoundationDB has also been designed and tested to guarantee that all ACID properties are preserved, even under catastrophic failures. Read more about our :doc:`fault tolerance <fault-tolerance>`.
|
||||
|
||||
Replicated Storage
|
||||
------------------
|
||||
|
||||
FoundationDB stores each piece of data on multiple servers. If a server containing one of the copies is lost, FoundationDB will automatically heal, finding a new location for the lost copy. For read operations, clients communicate directly to the servers with the replicas, requesting a specific version to ensure a consistent view of the data.
|
||||
|
||||
Ordered Key-Value API
|
||||
---------------------
|
||||
|
||||
Simple can be powerful. FoundationDB uses an ordered key-value data model (and richer data models are exposed via :doc:`layers <layer-concept>`). Each "row" within the database consists of a key that is used to reference the row and a value which stores data associated with the key. No specific format for the keys or values is required; they are simply binary data. Because keys are kept in lexicographical (sorted) order, ranges of key-value pairs can be read efficiently.
|
||||
|
||||
Watches
|
||||
-------
|
||||
|
||||
Clients can create transactional watches on keys to ensure that they are notified if the value changes. After a watch is registered, FoundationDB efficiently pushes change notifications to clients without polling.
|
||||
|
||||
Atomic Operations
|
||||
-----------------
|
||||
|
||||
FoundationDB includes support for specific "atomic operations" (e.g. Add) within a transaction to manipulate the value of a key without requiring the client to actually read the value. This makes these operations "zero-latency" and enables a variety of advanced data structures to be implemented more efficiently as layers.
|
||||
|
||||
OLTP and OLAP
|
||||
-------------
|
||||
|
||||
FoundationDB is optimized for online transaction processing (OLTP) workloads consisting of many small reads and writes. However, because it is an *ordered* key-value store, FoundationDB can use range reads to efficiently scan large swaths of data. Thus, FoundationDB can be effectively used for online analytical processing (OLAP) workloads as well.
|
||||
|
||||
Performance
|
||||
===========
|
||||
|
||||
Low, predictable latencies
|
||||
--------------------------
|
||||
|
||||
FoundationDB provides predictable throughput and low-latency random IO, even under workloads with unusual or erratically changing access patterns. Further, both FoundationDB's API and implementation have been designed to make it possible to understand the costs of the operations being executed. For example, clearing any range, even the entire database, is a fast operation.
|
||||
Read more about :doc:`performance <performance>`.
|
||||
|
||||
Load balancing
|
||||
--------------
|
||||
|
||||
FoundationDB achieves full utilization of a cluster under variable real-world workloads by using two major techniques. First, individual chunks of data are continuously moved from machine to machine to balance the load minute-to-minute. Second, on a faster time scale, individual requests can be redirected from a busy machine to a less-busy peer that also has a copy of the data. These techniques work together to optimize both throughput and latency.
|
||||
|
||||
Bursting
|
||||
--------
|
||||
|
||||
By deferring background work for later, FoundationDB can provide higher burst write speeds, often up to triple the steady-state speed. This ability to efficiently "absorb" work can last several minutes. The capability to buffer bursts of work allows FoundationDB to be provisioned without worrying about instantaneous load peaks, and to keep latencies low even when pushing "above 100%" load.
|
||||
|
||||
Distributed Caching
|
||||
-------------------
|
||||
|
||||
When your database scales out, you don't need a separate distributed caching layer: you already have one. FoundationDB uses the aggregate memory of the entire cluster to cache commonly accessed data. Unlike a caching tier such as memcached, FoundationDB's cache is completely synchronized to the database and provides all ACID guarantees.
|
||||
|
||||
Concurrency
|
||||
===========
|
||||
|
||||
Non-blocking
|
||||
------------
|
||||
|
||||
FoundationDB uses multiversion concurrency control to provide transactionally isolated reads without locking data or blocking writes. Optimistic concurrency control ensures that deadlocks are impossible and that slow or failing clients cannot interfere with the operation of the database.
|
||||
|
||||
Concurrent Connections
|
||||
----------------------
|
||||
|
||||
FoundationDB is not slowed by large numbers of concurrent client connections. Because it uses a threadless communications and concurrency model, FoundationDB does not have to create a thread per connection. This allows full performance even with hundreds of thousands of in-flight requests.
|
||||
|
||||
Interactive Transactions
|
||||
------------------------
|
||||
|
||||
FoundationDB transactions are true interactive sessions, unlike distributed databases that require stored procedures. This means that client code can make an iterative series of reads and writes over the network to execute complex transactions.
|
||||
|
||||
Operations
|
||||
==========
|
||||
|
||||
Elastic
|
||||
-------
|
||||
|
||||
A FoundationDB database can start on a single machine and be expanded to a cluster as load and circumstances require. Adding a machine is as easy as running another FoundationDB process, even during database operation and without any extra administration. Data is continuously re-partitioned in the background; no manual data distribution or sharding is required.
|
||||
|
||||
Datacenter Failover
|
||||
-------------------
|
||||
|
||||
FoundationDB can be configured to run multiple live redundant clusters in geographically diverse datacenters. Each datacenter contains a complete and fully up-to-date copy of all data in the system, allowing for minimal downtime even when an entire datacenter becomes unavailable.
|
||||
|
||||
Self Tuning
|
||||
-----------
|
||||
|
||||
FoundationDB has been designed so that many functions (such as data distribution, fault tolerance, incorporation of new nodes, performance tuning, etc.) are done automatically and require minimal management. Management tools allow configuration of parameters like replication policy, cluster topology, and data directories. A status monitoring tool lets you monitor cluster health and utilization of the cluster's physical resources.
|
||||
|
||||
Deploy Anywhere
|
||||
---------------
|
||||
|
||||
Because it can scale linearly by adding new machines, FoundationDB is an ideal database for deployment in public or private cloud environments. For best performance in cloud environments with limited I/O, FoundationDB can be configured to use a durable in-memory storage engine instead of its default SSD-optimized storage engine.
|
||||
|
||||
Backup
|
||||
------
|
||||
|
||||
An integrated backup system provides a true "moment-in-time" snapshot backup of the entire distributed database stored to a remote file system on a schedule. Although FoundationDB itself is fault tolerant, this capability is useful for recovering from disasters or unintentional modification of the database.
|
|
@ -0,0 +1,97 @@
|
|||
####
|
||||
Flow
|
||||
####
|
||||
|
||||
Engineering challenges
|
||||
======================
|
||||
|
||||
FoundationDB began with ambitious goals for both :doc:`high performance <performance>` per node and :doc:`scalability <scalability>`. We knew that to achieve these goals we would face serious engineering challenges while developing the FoundationDB core. We'd need to implement efficient asynchronous communicating processes of the sort supported by `Erlang <http://en.wikipedia.org/wiki/Erlang_(programming_language)>`_ or the `Async library in .NET <http://msdn.microsoft.com/en-us/library/vstudio/hh191443.aspx>`_, but we'd also need the raw speed and I/O efficiency of C++. Finally, we'd need to perform extensive simulation to engineer for reliability and fault tolerance on large clusters.
|
||||
|
||||
To meet these challenges, we developed several new tools, the first of which is Flow, a new programming language that brings `actor-based concurrency <http://en.wikipedia.org/wiki/Actor_model>`_ to C++11. To add this capability, Flow introduces a number of new keywords and control-flow primitives for managing concurrency. Flow is implemented as a compiler which analyzes an asynchronous function (actor) and rewrites it as an object with many different sub-functions that use callbacks to avoid blocking (see `streamlinejs <https://github.com/Sage/streamlinejs>`_ for a similar concept using JavaScript). The Flow compiler's output is normal C++11 code, which is then compiled to a binary using traditional tools. Flow also provides input to our simulation tool, which conducts deterministic simulations of the entire system, including its physical interfaces and failure modes. In short, Flow allows efficient concurrency within C++ in a maintainable and extensible manner, achieving all three major engineering goals:
|
||||
|
||||
* high performance (by compiling to native code),
|
||||
* actor-based concurrency (for high productivity development),
|
||||
* simulation support (for testing).
|
||||
|
||||
A first look
|
||||
============
|
||||
|
||||
Actors in Flow receive asynchronous messages from each other using a data type called a *future*. When an actor requires a data value to continue computation, it waits for it without blocking other actors. The following simple actor performs asynchronous addition. It takes a future integer and a normal integer as an offset, waits on the future integer, and returns the sum of the value and the offset:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
ACTOR Future<int> asyncAdd(Future<int> f, int offset) {
|
||||
int value = wait( f );
|
||||
return value + offset;
|
||||
}
|
||||
|
||||
Flow features
|
||||
=============
|
||||
|
||||
Flow's new keywords and control-flow primitives support the capability to pass messages asynchronously between components. Here's a brief overview.
|
||||
|
||||
Promise<T> and Future<T>
|
||||
------------------------
|
||||
|
||||
The data types that connect asynchronous senders and receivers are ``Promise<T>`` and ``Future<T>`` for some C++ type ``T``. When a sender holds a ``Promise<T>``, it represents a promise to deliver a value of type ``T`` at some point in the future to the holder of the ``Future<T>``. Conversely, a receiver holding a ``Future<T>`` can asynchronously continue computation until the point at which it actually needs the ``T``.
|
||||
|
||||
Promises and futures can be used within a single process, but their real strength in a distributed system is that they can traverse the network. For example, one computer could create a promise/future pair, then send the promise to another computer over the network. The promise and future will still be connected, and when the promise is fulfilled by the remote computer, the original holder of the future will see the value appear.
|
||||
|
||||
wait()
|
||||
------
|
||||
|
||||
At the point when a receiver holding a ``Future<T>`` needs the ``T`` to continue computation, it invokes the ``wait()`` statement with the ``Future<T>`` as its parameter. The ``wait()`` statement allows the calling actor to pause execution until the value of the future is set, returning a value of type ``T``. During the wait, other actors can continue execution, providing asynchronous concurrency within a single process.
|
||||
|
||||
ACTOR
|
||||
-----
|
||||
|
||||
Only functions labeled with the ``ACTOR`` tag can call ``wait()``. Actors are the essential unit of asynchronous work and can be composed to create complex message-passing systems. By composing actors, futures can be chained together so that the result of one depends on the output of another.
|
||||
|
||||
An actor is declared as returning a ``Future<T>`` where ``T`` may be ``Void`` if the actor's return value is used only for signaling. Each actor is preprocessed into a C++11 class with internal callbacks and supporting functions.
|
||||
|
||||
State
|
||||
-----
|
||||
|
||||
The ``state`` keyword is used to scope a variable so that it is visible across multiple ``wait()`` statements within an actor. The use of a ``state`` variable is illustrated in the example actor below.
|
||||
|
||||
PromiseStream<T>, FutureStream<T>
|
||||
---------------------------------
|
||||
|
||||
When a component wants to work with a *stream* of asynchronous messages rather than a single message, it can use ``PromiseStream<T>`` and ``FutureStream<T>``. These constructs allow for two important features: multiplexing and reliable delivery of messages. They also play an important role in Flow design patterns. For example, many of the servers in FoundationDB expose their interfaces as a ``struct`` of promise streams—one for each request type.
|
||||
|
||||
waitNext()
|
||||
----------
|
||||
|
||||
``waitNext()`` is the counterpart of ``wait()`` for streams. It pauses program execution and waits for the next value in a ``FutureStream``. If there is a value ready in the stream, execution continues without delay.
|
||||
|
||||
choose . . . when
|
||||
-----------------
|
||||
|
||||
The ``choose`` and ``when`` constructs allow an actor to wait for multiple futures at once in an ordered and predictable way.
|
||||
|
||||
Example: A Server Interface
|
||||
---------------------------
|
||||
|
||||
Below is an actor that runs on a single server communicating over the network. Its functionality is to maintain a count in response to asynchronous messages from other actors. It supports an interface implemented with a loop containing a ``choose`` statement with a ``when`` for each request type. Each ``when`` uses ``waitNext()`` to asynchronously wait for the next request in the stream. The add and subtract interfaces modify the count itself, stored with a state variable. The get interface takes a ``Promise<int>`` instead of just an ``int`` to facilitate sending back the return message.
|
||||
|
||||
To write the equivalent code directly in C++, a developer would have to implement a complex set of callbacks with exception-handling, requiring far more engineering effort. Flow makes it much easier to implement this sort of asynchronous coordination, with no loss of performance:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
ACTOR void serveCountingServerInterface(
|
||||
CountingServerInterface csi) {
|
||||
state int count = 0;
|
||||
while (1) {
|
||||
choose {
|
||||
when (int x = waitNext(csi.addCount.getFuture())){
|
||||
count += x;
|
||||
}
|
||||
when (int x = waitNext(csi.subtractCount.getFuture())){
|
||||
count -= x;
|
||||
}
|
||||
when (Promise<int> r = waitNext(csi.getCount.getFuture())){
|
||||
r.send( count ); // goes to client
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,114 @@
|
|||
.. _getting-started-linux:
|
||||
|
||||
########################
|
||||
Getting Started on Linux
|
||||
########################
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
This guide walks through installing a locally accessible FoundationDB server that is suitable for development on Linux.
|
||||
|
||||
To install an externally accessible FoundationDB cluster on one or more machines, see :doc:`building-cluster`.
|
||||
|
||||
First steps
|
||||
===========
|
||||
|
||||
* Validate your system meets the :ref:`system-requirements`.
|
||||
|
||||
* Download the FoundationDB packages for your system from :doc:`downloads`.
|
||||
|
||||
* Before upgrading from a previous version of FoundationDB, see :ref:`upgrading-foundationdb`.
|
||||
|
||||
Installing or upgrading FoundationDB packages
|
||||
=============================================
|
||||
|
||||
.. warning:: |upgrade-client-server-warning|
|
||||
|
||||
To install on **Ubuntu** use the dpkg command:
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
user@host$ sudo dpkg -i |package-deb-clients| \\
|
||||
|package-deb-server|
|
||||
|
||||
To install on **RHEL/CentOS 6** use the rpm command:
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
user@host$ sudo rpm -Uvh |package-rpm-clients| \\
|
||||
|package-rpm-server|
|
||||
|
||||
To install on **RHEL/CentOS 7** use the rpm command:
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
user@host$ sudo rpm -Uvh |package-rpm-clients| \\
|
||||
|package-rpm-server|
|
||||
|
||||
|simple-installation-mode-warnings|
|
||||
|
||||
|networking-clarification|
|
||||
|
||||
Testing your FoundationDB installation
|
||||
======================================
|
||||
|
||||
To verify that the local FoundationDB database is operational, open the command line interface (``fdbcli``) and use the status command. ::
|
||||
|
||||
user@host$ fdbcli
|
||||
Using cluster file `/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> status
|
||||
|
||||
Configuration:
|
||||
Redundancy mode - single
|
||||
Storage engine - memory
|
||||
Coordinators - 1
|
||||
|
||||
Cluster:
|
||||
FoundationDB processes - 1
|
||||
Machines - 1
|
||||
Memory availability - 4.1 GB per process on machine with least available
|
||||
Fault Tolerance - 0 machines
|
||||
Server time - Wed Oct 8 14:41:34 2014
|
||||
|
||||
Data:
|
||||
Replication health - Healthy
|
||||
Moving data - 0.000 GB
|
||||
Sum of key-value sizes - 0 MB
|
||||
|
||||
Operating space:
|
||||
Storage server - 1.0 GB free on most full server
|
||||
Transaction log - 1.0 GB free on most full server
|
||||
|
||||
Workload:
|
||||
Read rate - 2 Hz
|
||||
Write rate - 0 Hz
|
||||
Transactions started - 2 Hz
|
||||
Transactions committed - 0 Hz
|
||||
Conflict rate - 0 Hz
|
||||
|
||||
Client time: Thu Nov 20 09:50:45 2014
|
||||
|
||||
If these steps were successful you have installed and validated FoundationDB. You can now start using the database!
|
||||
|
||||
.. note:: If the database is not operational the ``status`` command will provide diagnostic information to help you resolve the issue.
|
||||
|
||||
Managing the FoundationDB service
|
||||
==================================
|
||||
|
||||
* See :ref:`administration-running-foundationdb`.
|
||||
* See :ref:`administration-removing`.
|
||||
|
||||
Next steps
|
||||
==========
|
||||
|
||||
* Install the APIs for :doc:`Ruby <api-ruby>`, `Java <javadoc/index.html>`_, or :doc:`Node.js <api-node>` if you intend to use those languages. :doc:`Python <api-python>` and :doc:`C <api-c>` APIs were installed along with the ``foundationdb-clients`` package above.
|
||||
* See :doc:`tutorials` for samples of developing applications with FoundationDB.
|
||||
* See :doc:`developer-guide` for information of interest to developers, including common design patterns and performance considerations.
|
||||
* See :doc:`administration` for detailed administration information.
|
||||
* See :doc:`known-limitations` of the system.
|
||||
* See :doc:`building-cluster` for step-by-step instructions on converting your local single-machine cluster to an externally visible cluster of one or more machines.
|
||||
|
|
@ -0,0 +1,104 @@
|
|||
.. _getting-started-mac:
|
||||
|
||||
########################
|
||||
Getting Started on macOS
|
||||
########################
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
This guide walks through installing a locally accessible FoundationDB server that is suitable for development on macOS.
|
||||
|
||||
.. note:: |platform-not-supported-for-production|
|
||||
|
||||
First steps
|
||||
===========
|
||||
|
||||
* Validate that your system has
|
||||
|
||||
* x86-64 processor architecture
|
||||
* 4 GB RAM (per process)
|
||||
* macOS 10.7 or newer
|
||||
|
||||
* Download the FoundationDB packages for your system from :doc:`downloads`.
|
||||
|
||||
* Before upgrading from a previous version of FoundationDB, see :ref:`upgrading-foundationdb`.
|
||||
|
||||
Installing or upgrading FoundationDB
|
||||
====================================
|
||||
|
||||
To begin installation, double-click on |package-mac|. Follow the instructions and select the components that you want to install.
|
||||
|
||||
Client-only installation
|
||||
------------------------
|
||||
|
||||
By default, the FoundationDB installer installs the binaries required to run both clients and a local development server. If you don't intend to run the FoundationDB server on your machine, you can deselect the "FoundationDB Server" option. Copy the :ref:`cluster file <foundationdb-cluster-file>` from a server or client in the cluster you want to connect to and place it in ``/usr/local/etc/foundationdb/``.
|
||||
|
||||
Other considerations
|
||||
--------------------
|
||||
|
||||
|simple-installation-mode-warnings|
|
||||
|
||||
|networking-clarification|
|
||||
|
||||
Testing your FoundationDB installation
|
||||
======================================
|
||||
|
||||
To verify that the local FoundationDB database is operational, open the command line interface (``fdbcli``) and use the status command. ::
|
||||
|
||||
host:~ user$ fdbcli
|
||||
Using cluster file `/usr/local/etc/foundationdb/fdb.cluster'.
|
||||
|
||||
The database is available.
|
||||
|
||||
Welcome to the fdbcli. For help, type `help'.
|
||||
fdb> status
|
||||
|
||||
Configuration:
|
||||
Redundancy mode - single
|
||||
Storage engine - memory
|
||||
Coordinators - 1
|
||||
|
||||
Cluster:
|
||||
FoundationDB processes - 1
|
||||
Machines - 1
|
||||
Memory availability - 4.1 GB per process on machine with least available
|
||||
Fault Tolerance - 0 machines
|
||||
Server time - Wed Oct 8 14:41:34 2014
|
||||
|
||||
Data:
|
||||
Replication health - Healthy
|
||||
Moving data - 0.000 GB
|
||||
Sum of key-value sizes - 0 MB
|
||||
|
||||
Operating space:
|
||||
Storage server - 1.0 GB free on most full server
|
||||
Transaction log - 1.0 GB free on most full server
|
||||
|
||||
Workload:
|
||||
Read rate - 2 Hz
|
||||
Write rate - 0 Hz
|
||||
Transactions started - 2 Hz
|
||||
Transactions committed - 0 Hz
|
||||
Conflict rate - 0 Hz
|
||||
|
||||
Client time: Thu Nov 20 09:56:52 2014
|
||||
|
||||
If these steps were successful you have installed and validated FoundationDB. You can now start using the database!
|
||||
|
||||
.. note:: If the database is not operational the ``status`` command will provide diagnostic information to help you resolve the issue.
|
||||
|
||||
Managing the FoundationDB service
|
||||
==================================
|
||||
|
||||
* See :ref:`administration-running-foundationdb`.
|
||||
* See :ref:`administration-removing`.
|
||||
|
||||
Next steps
|
||||
==========
|
||||
|
||||
* Install the APIs for :doc:`Ruby <api-ruby>`, `Java <javadoc/index.html>`_, or :doc:`Node.js <api-node>` if you intend to use those languages. :doc:`Python <api-python>` and :doc:`C <api-c>` APIs were installed using the FoundationDB installer above.
|
||||
* See :doc:`tutorials` for samples of developing applications with FoundationDB.
|
||||
* See :doc:`developer-guide` for information of interest to developers, including common design patterns and performance considerations.
|
||||
* See :doc:`administration` for detailed administration information.
|
||||
* See :doc:`known-limitations` of the system.
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
.. -*- mode: rst; -*-
|
||||
|
||||
.. |platform-not-supported-for-production| replace::
|
||||
This version of FoundationDB server is intended only for locally available operation for development purposes. Externally available operation, performance testing, and production use are supported by the :doc:`Linux<getting-started-linux>` version.
|
||||
|
||||
.. |cluster-file-rule1| replace::
|
||||
The ``description`` is a logical description of the database using alphanumeric characters (a-z, A-Z, 0-9) and underscores.
|
||||
|
||||
.. |cluster-file-rule2| replace::
|
||||
The ``ID`` is an arbitrary value containing alphanumeric characters (a-z, A-Z, 0-9). We recommend using a random eight-character identifier (such as the output of ``mktemp -u XXXXXXXX``).
|
||||
|
||||
.. |cluster-file-rule3| replace::
|
||||
The list of ``IP:PORT`` pairs specify the set of coordination servers. A majority of these servers must be available for the database to be operational so they should be chosen carefully. The number of coordination servers should therefore be odd and must be more than one to support fault-tolerance. We recommend using five coordination servers when using ``triple`` mode to maintain the ability to tolerate two simultaneous machine failures.
|
||||
|
||||
.. |simple-installation-mode-warnings| replace::
|
||||
FoundationDB installs in a single-server mode appropriate for a development workstation. In this mode data is not replicated, and therefore the database is **not failure tolerant**. This default database is also configured to use the memory storage engine which, while using the disk for durability, **requires data to fit in RAM**.
|
||||
|
||||
.. |networking-clarification| replace::
|
||||
By default FoundationDB uses the loopback IP (``127.0.0.1``). In this configuration all parts of FoundationDB, **including client applications**, must run on the same machine and communicate via ``127.0.0.1``, not via external IPs.
|
||||
|
||||
.. |development-use-only-warning| replace::
|
||||
The macOS version of the FoundationDB server is intended for single-machine development use only; its use in multi-machine clusters is not supported. In the present release, the Linux version is the best-tested and most performant platform for multi-machine clusters.
|
||||
|
||||
.. |upgrade-client-server-warning| replace::
|
||||
Unless using the :ref:`multi-version-client-api`, the installed client and server packages must have the same version. When upgrading a FoundationDB cluster, be sure also to upgrade all clients to the same version.
|
||||
|
||||
.. |optimize-configuration| replace::
|
||||
The default installation runs only one FoundationDB server process per machine (which will use only one CPU core). Most users of multi-machine configurations will want to maximize performance by running one FoundationDB server process per core. This is accomplished by modifying the :ref:`configuration file <foundationdb-conf>` (located at ``/etc/foundationdb/foundationdb.conf``) to have ``[fdbserver.<ID>]`` sections for each core. Note that 4GiB ECC RAM are required per FoundationDB server process (see :ref:`system-requirements`).
|
||||
|
||||
.. |coordinators-auto| replace::
|
||||
``coordinators auto`` selects processes based on IP address. If your cluster has processes on the same machine with different IP addresses, ``coordinators auto`` may select a set of coordinators that are not fault tolerant. To ensure maximal fault tolerance, we recommend selecting coordinators according to the criteria in :ref:`configuration-choosing-coordination-servers` and setting them manually.
|
||||
|
||||
.. |conf-file-change-detection| replace::
|
||||
Whenever the ``foundationdb.conf`` file changes, the ``fdbmonitor`` daemon automatically detects the changes and starts, stops, or restarts child processes as necessary.
|
||||
|
||||
.. |package-deb-clients| replace::
|
||||
foundationdb-clients\_\ |release|\ -1\_amd64.deb
|
||||
|
||||
.. |package-deb-server| replace::
|
||||
foundationdb-server\_\ |release|\ -1\_amd64.deb
|
||||
|
||||
.. |package-rpm6-clients| replace::
|
||||
foundationdb-clients-|release|\ -1.el6.x86_64.rpm
|
||||
|
||||
.. |package-rpm6-server| replace::
|
||||
foundationdb-server-|release|\ -1.el6.x86_64.rpm
|
||||
|
||||
.. |package-rpm-clients| replace::
|
||||
foundationdb-clients-|release|\ -1.el7.x86_64.rpm
|
||||
|
||||
.. |package-rpm-server| replace::
|
||||
foundationdb-server-|release|\ -1.el7.x86_64.rpm
|
||||
|
||||
.. |package-mac| replace::
|
||||
FoundationDB-|release|.pkg
|
||||
|
||||
.. |package-win| replace::
|
||||
foundationdb-|release|\ -x64.msi
|
|
@ -0,0 +1,240 @@
|
|||
######################
|
||||
Hierarchical Documents
|
||||
######################
|
||||
|
||||
:doc:`Python <hierarchical-documents>` **Java**
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Create a representation for hierarchical `documents <http://en.wikipedia.org/wiki/Document-oriented_database>`_.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Support efficient storage and retrieval of documents, both as a whole and by subdocuments specified by paths.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
A hierarchical document has a tree-like structure, with the document ID as the root. We'll map the hierarchy to a list of tuples in which each tuple corresponds to a path from the root to a leaf. These tuples will form keys, so each leaf is indexed by the path leading to it.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
Because each tuple represents a path from the document root to a leaf, the lexicographic ordering of tuples guarantees that adjacent paths will be stored in adjacent keys. Each tuple prefix will correspond to a subdocument that can be retrieved using a prefix range read. Likewise, a range read using the root as a prefix will retrieve the entire document.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
A document will consist of a dictionary whose values may be simple data types (e.g., integers or strings), lists, or (nested) dictionaries. Each document will be stored under a unique ID. If a document ID has not already been supplied, we randomly generate one.
|
||||
|
||||
We convert the document to a list of tuples representing each path from the root to a leaf. Each tuple is used to construct a composite key within a subspace. The document ID becomes the first element after the subspace prefix, followed by the remainder of the path. We store the leaf (the last element of the tuple) as the value, which enables storage of larger data sizes (see :ref:`Key and value sizes <data-modeling-performance-guidelines>`).
|
||||
|
||||
If we're given a serialized JSON object to start with, we just deserialize it before converting it to tuples. To distinguish the list elements in the document (a.k.a. JSON arrays) from dictionary elements and preserve the order of the lists, we include the index of each list element before it in the tuple.
|
||||
|
||||
We can retrieve any subdocument based on the partial path to its root. The partial path will just be a tuple that the query function uses as a key prefix for a range read. The retrieved data will be a list of tuples. The final step before returning the data is to convert it back to a document.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
Indexing
|
||||
--------
|
||||
|
||||
We could extend the document model to allow selective indexing of keys or values at specified locations within a document.
|
||||
|
||||
Query language
|
||||
--------------
|
||||
|
||||
We could extend the document to support more powerful query capabilities, either with query functions or a full query language. Either would be designed to take advantage of existing indexes.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here’s a basic implementation of the recipe.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
public class MicroDoc {
|
||||
|
||||
private static final FDB fdb;
|
||||
private static final Database db;
|
||||
private static final Subspace docSpace;
|
||||
private static final long EMPTY_OBJECT = -2;
|
||||
private static final long EMPTY_ARRAY = -1;
|
||||
|
||||
static {
|
||||
fdb = FDB.selectAPIVersion(510);
|
||||
db = fdb.open();
|
||||
docSpace = new Subspace(Tuple.from("D"));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private static ArrayList<Tuple> toTuplesSwitch(Object o){
|
||||
if(o instanceof ArrayList){
|
||||
return toTuples((ArrayList<Object>) o);
|
||||
} else if(o instanceof Map){
|
||||
return toTuples((Map<Object,Object>) o);
|
||||
} else {
|
||||
return toTuples(o);
|
||||
}
|
||||
}
|
||||
|
||||
private static ArrayList<Tuple> toTuples(ArrayList<Object> item){
|
||||
if(item.isEmpty()){
|
||||
ArrayList<Tuple> val = new ArrayList<Tuple>();
|
||||
val.add(Tuple.from(EMPTY_ARRAY, null));
|
||||
return val;
|
||||
} else {
|
||||
ArrayList<Tuple> val = new ArrayList<Tuple>();
|
||||
for(int i = 0; i < item.size(); i++){
|
||||
for(Tuple sub : toTuplesSwitch(item.get(i))){
|
||||
val.add(Tuple.from(i).addAll(sub));
|
||||
}
|
||||
}
|
||||
return val;
|
||||
}
|
||||
}
|
||||
|
||||
private static ArrayList<Tuple> toTuples(Map<Object,Object> item){
|
||||
if(item.isEmpty()){
|
||||
ArrayList<Tuple> val = new ArrayList<Tuple>();
|
||||
val.add(Tuple.from(EMPTY_OBJECT, null));
|
||||
return val;
|
||||
} else {
|
||||
ArrayList<Tuple> val = new ArrayList<Tuple>();
|
||||
for(Entry<Object,Object> e : item.entrySet()){
|
||||
for(Tuple sub : toTuplesSwitch(e.getValue())){
|
||||
val.add(Tuple.from(e.getKey()).addAll(sub));
|
||||
}
|
||||
}
|
||||
return val;
|
||||
}
|
||||
}
|
||||
|
||||
private static ArrayList<Tuple> toTuples(Object item){
|
||||
ArrayList<Tuple> val = new ArrayList<Tuple>();
|
||||
val.add(Tuple.from(item));
|
||||
return val;
|
||||
}
|
||||
|
||||
private static ArrayList<Tuple> getTruncated(ArrayList<Tuple> vals){
|
||||
ArrayList<Tuple> list = new ArrayList<Tuple>();
|
||||
for(Tuple val : vals){
|
||||
list.add(val.popFront());
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
private static Object fromTuples(ArrayList<Tuple> tuples){
|
||||
if(tuples == null){
|
||||
return null;
|
||||
}
|
||||
|
||||
Tuple first = tuples.get(0); // Determine kind of object from
|
||||
// first tuple.
|
||||
if(first.size() == 1){
|
||||
return first.get(0); // Primitive type.
|
||||
}
|
||||
|
||||
if(first.equals(Tuple.from(EMPTY_OBJECT, null))){
|
||||
return new HashMap<Object,Object>(); // Empty map.
|
||||
}
|
||||
|
||||
if(first.equals(Tuple.from(EMPTY_ARRAY))){
|
||||
return new ArrayList<Object>(); // Empty list.
|
||||
}
|
||||
|
||||
HashMap<Object,ArrayList<Tuple>> groups = new HashMap<Object,ArrayList<Tuple>>();
|
||||
for(Tuple t : tuples){
|
||||
if(groups.containsKey(t.get(0))){
|
||||
groups.get(t.get(0)).add(t);
|
||||
} else {
|
||||
ArrayList<Tuple> list = new ArrayList<Tuple>();
|
||||
list.add(t);
|
||||
groups.put(t.get(0),list);
|
||||
}
|
||||
}
|
||||
|
||||
if(first.get(0).equals(0l)){
|
||||
// Array.
|
||||
ArrayList<Object> array = new ArrayList<Object>();
|
||||
for(Entry<Object,ArrayList<Tuple>> g : groups.entrySet()){
|
||||
array.add(fromTuples(getTruncated(g.getValue())));
|
||||
}
|
||||
return array;
|
||||
} else {
|
||||
// Object.
|
||||
HashMap<Object,Object> map = new HashMap<Object,Object>();
|
||||
for(Entry<Object,ArrayList<Tuple>> g : groups.entrySet()){
|
||||
map.put(g.getKey(), fromTuples(getTruncated(g.getValue())));
|
||||
}
|
||||
return map;
|
||||
}
|
||||
}
|
||||
|
||||
public static Object insertDoc(TransactionContext tcx, final Map<Object,Object> doc){
|
||||
return tcx.run(new Function<Transaction,Object>() {
|
||||
public Object apply(Transaction tr){
|
||||
if(!doc.containsKey("doc_id")){
|
||||
doc.put("doc_id", getNewID(tr));
|
||||
}
|
||||
for(Tuple t : toTuples(doc)){
|
||||
tr.set(docSpace.pack(Tuple.from(doc.get("doc_id")).addAll(t.popBack())),
|
||||
Tuple.from(t.get(t.size() - 1)).pack());
|
||||
}
|
||||
return doc.get("doc_id");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static Object getDoc(TransactionContext tcx, final Object ID){
|
||||
return getDoc(tcx, ID, Tuple.from());
|
||||
}
|
||||
|
||||
public static Object getDoc(TransactionContext tcx, final Object ID, final Tuple prefix){
|
||||
return tcx.run(new Function<Transaction,Object>() {
|
||||
public Object apply(Transaction tr){
|
||||
Future<byte[]> v = tr.get(docSpace.pack(Tuple.from(ID).addAll(prefix)));
|
||||
if(v.get() != null){
|
||||
// One single item.
|
||||
ArrayList<Tuple> vals = new ArrayList<Tuple>();
|
||||
vals.add(prefix.addAll(Tuple.fromBytes(v.get())));
|
||||
return fromTuples(vals);
|
||||
} else {
|
||||
// Multiple items.
|
||||
ArrayList<Tuple> vals = new ArrayList<Tuple>();
|
||||
for(KeyValue kv : tr.getRange(docSpace.range(Tuple.from(ID).addAll(prefix)))){
|
||||
vals.add(docSpace.unpack(kv.getKey()).popFront().addAll(Tuple.fromBytes(kv.getValue())));
|
||||
}
|
||||
return fromTuples(vals);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private static int getNewID(TransactionContext tcx){
|
||||
return tcx.run(new Function<Transaction,Integer>() {
|
||||
@SuppressWarnings("unused")
|
||||
public Integer apply(Transaction tr){
|
||||
boolean found = false;
|
||||
int newID;
|
||||
do {
|
||||
newID = (int)(Math.random()*100000000);
|
||||
found = true;
|
||||
for(KeyValue kv : tr.getRange(docSpace.range(Tuple.from(newID)))){
|
||||
// If not empty, this is false.
|
||||
found = false;
|
||||
break;
|
||||
}
|
||||
} while(!found);
|
||||
return newID;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
|
@ -0,0 +1,120 @@
|
|||
######################
|
||||
Hierarchical Documents
|
||||
######################
|
||||
|
||||
**Python** :doc:`Java <hierarchical-documents-java>`
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Create a representation for hierarchical `documents <http://en.wikipedia.org/wiki/Document-oriented_database>`_.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Support efficient storage and retrieval of documents, both as a whole and by subdocuments specified by paths.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
A hierarchical document has a tree-like structure, with the document ID as the root. We'll map the hierarchy to a list of tuples in which each tuple corresponds to a path from the root to a leaf. These tuples will form keys, so each leaf is indexed by the path leading to it.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
Because each tuple represents a path from the document root to a leaf, the lexicographic ordering of tuples guarantees that adjacent paths will be stored in adjacent keys. Each tuple prefix will correspond to a subdocument that can be retrieved using a prefix range read. Likewise, a range read using the root as a prefix will retrieve the entire document.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
A document will consist of a dictionary whose values may be simple data types (e.g., integers or strings), lists, or (nested) dictionaries. Each document will be stored under a unique ID. If a document ID has not already been supplied, we randomly generate one.
|
||||
|
||||
We convert the document to a list of tuples representing each path from the root to a leaf. Each tuple is used to construct a composite key within a subspace. The document ID becomes the first element after the subspace prefix, followed by the remainder of the path. We store the leaf (the last element of the tuple) as the value, which enables storage of larger data sizes (see :ref:`Key and value sizes <data-modeling-performance-guidelines>`).
|
||||
|
||||
If we're given a serialized JSON object to start with, we just deserialize it before converting it to tuples. To distinguish the list elements in the document (a.k.a. JSON arrays) from dictionary elements and preserve the order of the lists, we include the index of each list element before it in the tuple.
|
||||
|
||||
We can retrieve any subdocument based on the partial path to its root. The partial path will just be a tuple that the query function uses as a key prefix for a range read. The retrieved data will be a list of tuples. The final step before returning the data is to convert it back to a document.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
Indexing
|
||||
--------
|
||||
|
||||
We could extend the document model to allow selective indexing of keys or values at specified locations within a document.
|
||||
|
||||
Query language
|
||||
--------------
|
||||
|
||||
We could extend the document to support more powerful query capabilities, either with query functions or a full query language. Either would be designed to take advantage of existing indexes.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here’s a basic implementation of the recipe.
|
||||
::
|
||||
|
||||
import itertools
|
||||
import json
|
||||
import random
|
||||
|
||||
doc_space = fdb.Subspace(('D',))
|
||||
|
||||
EMPTY_OBJECT = -2
|
||||
EMPTY_ARRAY = -1
|
||||
|
||||
def to_tuples(item):
|
||||
if item == {}:
|
||||
return [(EMPTY_OBJECT, None)]
|
||||
elif item == []:
|
||||
return [(EMPTY_ARRAY, None)]
|
||||
elif type(item) == dict:
|
||||
return [(k,) + sub for k, v in item.iteritems() for sub in to_tuples(v)]
|
||||
elif type(item) == list:
|
||||
return [(k,) + sub for k, v in enumerate(item) for sub in to_tuples(v)]
|
||||
else:
|
||||
return [(item,)]
|
||||
|
||||
def from_tuples(tuples):
|
||||
if not tuples: return {}
|
||||
first = tuples[0] # Determine kind of object from first tuple
|
||||
if len(first) == 1: return first[0] # Primitive value
|
||||
if first == (EMPTY_OBJECT,None): return {}
|
||||
if first == (EMPTY_ARRAY, None): return []
|
||||
# For an object or array, we need to group the tuples by their first element
|
||||
groups = [list(g) for k, g in itertools.groupby(tuples, lambda t:t[0])]
|
||||
if first[0] == 0: # array
|
||||
return [from_tuples([t[1:] for t in g]) for g in groups]
|
||||
else: # object
|
||||
return dict((g[0][0], from_tuples([t[1:] for t in g])) for g in groups)
|
||||
|
||||
@fdb.transactional
|
||||
def insert_doc(tr, doc):
|
||||
if type(doc) == str:
|
||||
doc = json.loads(doc)
|
||||
if not 'doc_id' in doc:
|
||||
new_id = _get_new_id(tr)
|
||||
doc['doc_id'] = new_id
|
||||
for tup in to_tuples( doc ):
|
||||
tr[doc_space.pack((doc['doc_id'],) + tup[:-1])] = fdb.tuple.pack((tup[-1],))
|
||||
return doc['doc_id']
|
||||
|
||||
@fdb.transactional
|
||||
def _get_new_id(tr):
|
||||
found = False
|
||||
while (not found):
|
||||
new_id = random.randint(0, 100000000)
|
||||
found = True
|
||||
for _ in tr[doc_space[new_id].range()]:
|
||||
found = False
|
||||
break
|
||||
return new_id
|
||||
|
||||
@fdb.transactional
|
||||
def get_doc(tr, doc_id, prefix=()):
|
||||
v = tr[doc_space.pack((doc_id,) + prefix)]
|
||||
if v.present():
|
||||
return from_tuples([prefix + fdb.tuple.unpack(v)])
|
||||
else:
|
||||
return from_tuples([doc_space.unpack(k)[1:] + fdb.tuple.unpack(v)
|
||||
for k, v in tr[doc_space.range((doc_id,)+prefix)]])
|
Binary file not shown.
After Width: | Height: | Size: 18 KiB |
File diff suppressed because one or more lines are too long
Binary file not shown.
After Width: | Height: | Size: 27 KiB |
Binary file not shown.
After Width: | Height: | Size: 33 KiB |
Binary file not shown.
After Width: | Height: | Size: 29 KiB |
Binary file not shown.
After Width: | Height: | Size: 15 KiB |
Binary file not shown.
After Width: | Height: | Size: 44 KiB |
|
@ -0,0 +1,56 @@
|
|||
.. FoundationDB documentation master file
|
||||
|
||||
######################
|
||||
FoundationDB |version|
|
||||
######################
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
FoundationDB is a distributed database designed to handle large volumes of structured data across clusters of commodity servers. It organizes data as an ordered key-value store and employs :doc:`ACID transactions <transaction-manifesto>` for all operations. It is especially well-suited for read/write workloads but also has excellent :doc:`performance <performance>` for write-intensive workloads. Users interact with the database using an :doc:`API language binding <api-reference>`.
|
||||
|
||||
Local Development
|
||||
=================
|
||||
|
||||
FoundationDB runs on clusters in Apple data centers, but you can :doc:`begin local development <local-dev>` now.
|
||||
|
||||
Documentation
|
||||
=============
|
||||
|
||||
FoundationDB is a robust choice for a broad range of use cases:
|
||||
|
||||
**Developers can store all types of data.** FoundationDB is multi-model, meaning you can store many types of data in a single database. All data is safely stored, distributed, and replicated in FoundationDB.
|
||||
|
||||
**Administrators easily scale and handle hardware failures.** FoundationDB is easy to install, grow, and manage. It has a distributed architecture that gracefully scales out and handles faults while acting like a single ACID database.
|
||||
|
||||
**FoundationDB has industry-leading performance.** FoundationDB provides amazing performance on commodity hardware, allowing you to support very heavy loads at low cost.
|
||||
|
||||
**FoundationDB supports flexible application architectures.** Your application can talk directly to FoundationDB, to a layer, or both. Layers provide new capability on top of FoundationDB but are stateless.
|
||||
|
||||
The latest changes are detailed in :doc:`release-notes`. The documentation has the following sections:
|
||||
|
||||
* :doc:`why-foundationdb` describes the technical alternatives involved in NoSQL database design and explains the advantages of transaction processing at scale.
|
||||
|
||||
* :doc:`technical-overview` explains the engineering design of FoundationDB, with detailed information on its features, architecture, and performance.
|
||||
|
||||
* :doc:`client-design` contains documentation on getting started, data modeling, and design principles for building applications with FoundationDB.
|
||||
|
||||
* :doc:`design-recipes` give specific examples of how to build new data models, indexes, and more on top of the key-value store API.
|
||||
|
||||
* :doc:`api-reference` gives a detailed description of the API for each language.
|
||||
|
||||
* :doc:`tutorials` provide simple examples of client design using FoundationDB.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:titlesonly:
|
||||
:hidden:
|
||||
|
||||
local-dev
|
||||
why-foundationdb
|
||||
technical-overview
|
||||
client-design
|
||||
design-recipes
|
||||
api-reference
|
||||
tutorials
|
||||
earlier-release-notes
|
|
@ -0,0 +1,118 @@
|
|||
|
||||
.. _known-limitations:
|
||||
|
||||
#################
|
||||
Known Limitations
|
||||
#################
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
FoundationDB has limitations, some of which may be addressed in future versions.
|
||||
|
||||
For related information, also see:
|
||||
* :doc:`platforms` that affect the operation of FoundationDB.
|
||||
* :ref:`system-requirements` for OS/hardware requirements.
|
||||
* :doc:`anti-features` for limitations of the scope of the FoundationDB core.
|
||||
* :ref:`developer-guide-peformance-considerations` for how different workloads can limit performance.
|
||||
|
||||
Design limitations
|
||||
==================
|
||||
|
||||
These limitations come from fundamental design decisions and are unlikely to change in the short term. Applications using FoundationDB should plan to work around these limitations. See :doc:`anti-features` for related discussion of our design approach to the FoundationDB core.
|
||||
|
||||
.. _long-transactions:
|
||||
|
||||
Long transactions
|
||||
-----------------
|
||||
|
||||
FoundationDB currently does not support transactions running for over five seconds. In particular, after 5 seconds from the first read in a transaction:
|
||||
|
||||
* subsequent reads that go to the database will usually raise a ``past_version`` :doc:`error <api-error-codes>` (although reads cached by the client will not);
|
||||
* a commit with any write will raise a ``past_version`` or ``not_committed`` :doc:`error <api-error-codes>`.
|
||||
|
||||
Clients need to avoid these cases. For the design reasons behind this limitation, see the discussion in :doc:`anti-features`.
|
||||
|
||||
.. admonition:: Workarounds
|
||||
|
||||
The effect of long and large transactions can be achieved using short and small transactions with a variety of techniques, depending on the desired behavior:
|
||||
|
||||
* If an application wants long transactions because of an external process in the loop, it can perform optimistic validation itself at a higher layer.
|
||||
* If it needs long-running read snapshots, it can perform versioning in a layer.
|
||||
* If it needs large bulk inserts, it can use a level of indirection to swap in the inserted data quickly.
|
||||
|
||||
As with all data modeling problems, please ask for help on the community site (or via e-mail) with your specific needs.
|
||||
|
||||
.. _large-transactions:
|
||||
|
||||
Large transactions
|
||||
------------------
|
||||
|
||||
Transaction size cannot exceed 10,000,000 bytes of affected data. Keys, values, and ranges that you read or write are all included as affected data. Likewise, conflict ranges that you :ref:`add <api-python-conflict-ranges>` or remove (using a :ref:`snapshot read <api-python-snapshot-reads>` or a :ref:`transaction option <api-python-no-write-conflict-range>`) are also added or removed from the scope of affected data.
|
||||
|
||||
If any single transaction exceeds one megabyte of affected data, you should modify your design. In the current version, these large transactions can cause performance issues and database availability can (briefly) be impacted.
|
||||
|
||||
.. admonition:: Workarounds
|
||||
|
||||
See the discussion in :ref:`long transactions <long-transactions>` for applicable workarounds.
|
||||
|
||||
.. _large-keys-and-values:
|
||||
|
||||
Large keys and values
|
||||
---------------------
|
||||
|
||||
Keys cannot exceed 10,000 bytes in size. Values cannot exceed 100,000 bytes in size. Errors will be raised by the client if these limits are exceeded.
|
||||
|
||||
.. admonition:: Workarounds
|
||||
|
||||
FoundationDB provides efficient ways to :doc:`design keys and values<largeval>` to work around this limitation.
|
||||
|
||||
.. _spinning-HDDs:
|
||||
|
||||
Spinning HDDs
|
||||
-------------
|
||||
|
||||
FoundationDB is only designed for good performance with rotational disk drives when using the durable :ref:`memory <configuration-storage-engine-memory>` storage engine. It is not recommended that you run FoundationDB on rotational HDDs when using the :ref:`ssd <configuration-storage-engine-ssd>` storage engine. Many algorithms and optimizations have been made to specifically target good performance on solid-state storage that do not translate well to HDDs. Reduced performance and/or database availability issues can be expected.
|
||||
|
||||
.. admonition:: Recommendation
|
||||
|
||||
Large disk arrays and abstracted storage subsystems with sufficient I/O performance may be able to overcome this limitation, but testing specific use cases will be required.
|
||||
|
||||
.. _dont-use-key-selectors-for-paging:
|
||||
|
||||
Key selectors with large offsets are slow
|
||||
-----------------------------------------
|
||||
|
||||
The current version of FoundationDB resolves key selectors with large offsets in O(offset) time. A common misuse of key selectors is using offsets to page through a large range of data (i.e. reading 'a'+0 to 'a'+100, then 'a'+100 to 'a'+200, etc.)
|
||||
|
||||
.. admonition:: Workarounds
|
||||
|
||||
An efficient alternative is to use the limit parameter with range reads, starting subsequent reads at the key after the last one returned. You can also use the iterator functionality available in most of the language bindings, which uses this technique internally.
|
||||
|
||||
The RankedSet layer provides a data structure in which large offsets and counting operations require only O(log N) time. It is a good choice for applications such as large leaderboards that require such functionality.
|
||||
|
||||
|
||||
Current limitations
|
||||
===================
|
||||
|
||||
These limitations do not reflect fundamental aspects of our design and are likely to be resolved or mitigated in future versions. Administrators should be aware of these issues, but longer-term application development should be less driven by them.
|
||||
|
||||
Cluster size
|
||||
------------
|
||||
|
||||
FoundationDB has undergone performance testing and tuning with clusters of up to 500 cores/processes. Significantly larger clusters may experience performance bottlenecks leading to sub-linear scaling or related issues.
|
||||
|
||||
Database size
|
||||
-------------
|
||||
|
||||
FoundationDB has been tested with databases up to 100 TB (total size of key-value pairs -- required disk space will be significantly higher after replication and overhead).
|
||||
|
||||
Limited read load balancing
|
||||
---------------------------
|
||||
|
||||
FoundationDB load balances reads across the servers with replicas of the data being read. However, it does not currently increase the replication factor of keys that are frequently read. As a result, the aggregate read performance of any given key or small contiguous set of keys in a triple-replicated system is limited to the total performance of three server processes (typically on the order of 100,000 reads per second).
|
||||
|
||||
.. admonition:: Workarounds
|
||||
|
||||
If data is accessed exceptionally frequently, an application could avoid this limitation by storing such data in multiple subspaces, effectively increasing its replication factor.
|
||||
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
###############################
|
||||
Managing Large Values and Blobs
|
||||
###############################
|
||||
|
||||
This tutorial illustrates techniques for storing and managing large values in FoundationDB. We'll look at using the blob (binary large object) layer, which provides a simple interface for storing unstructured data. We'll be drawing on :doc:`data-modeling` and :doc:`api-python`, so you should take a look at those documents if you're not familiar with them.
|
||||
|
||||
For an introductory tutorial that begins with "Hello world" and explains the basic concepts used in FoundationDB, take a look at our :doc:`class scheduling tutorial <class-scheduling>`.
|
||||
|
||||
Although we'll be using Python, the concepts in this tutorial are also applicable to the other :doc:`languages <api-reference>` supported by FoundationDB.
|
||||
|
||||
.. _largeval-modeling:
|
||||
|
||||
Modeling large values
|
||||
=====================
|
||||
|
||||
For key-value pairs stored in FoundationDB, values are limited to a size of 100 kB (see :ref:`Known Limitations<large-keys-and-values>`). Furthermore, you'll usually get the best performance by keeping value sizes below 10 kB, as discussed in our :ref:`performance guidelines<data-modeling-performance-guidelines>`.
|
||||
|
||||
.. _largeval-splitting:
|
||||
|
||||
Splitting structured values
|
||||
---------------------------
|
||||
|
||||
These factors lead to an obvious question: what should you do if your first cut at a data model results in values that are larger than those allowed by the above guidelines?
|
||||
|
||||
The answer depends on the nature and size of your values. If your values have some internal structure, consider revising your data model to split the values across multiple keys. For example, suppose you'd like to store a serialized JSON object. Instead of storing the object as the value of a single key, you could construct a key for each path in the object, as described for :ref:`documents <data-modeling-documents>`.
|
||||
|
||||
.. note:: In general, you should consider splitting your values if their sizes are above 10kb, or if they are above 1kb and you only use a part of each value after reading it.
|
|
@ -0,0 +1,21 @@
|
|||
#############
|
||||
Layer Concept
|
||||
#############
|
||||
|
||||
When we started building FoundationDB, instead of thinking about all the features that it could have, we asked ourselves *what features could we take away?* Almost everything, we decided. We simplified the core, allowing us to focus on making it as strong as possible, and built additional features as layers. Here's how the approach works:
|
||||
|
||||
The old way
|
||||
===========
|
||||
|
||||
When you choose a database today, you're not choosing one piece of technology, you're choosing three: storage technology, data model, and API/query language. For example, if you choose Postgres, you are choosing the Postgres storage engine, a relational data model, and the SQL query language. If you choose MongoDB you are choosing the MongoDB distributed storage engine, a document data model, and the MongoDB API. In systems like these, features are interwoven between all of the layers. For example, both of those systems provide indexes, and the notion of an index exists in all three layers.
|
||||
|
||||
Document databases, column-oriented, row-oriented, JSON, key-value, etc. all make sense in the right context, and often different parts of an application call for different choices. This creates a tough decision: Use a whole new database to support a new data model, or try to shoehorn data into your existing database.
|
||||
|
||||
The FoundationDB way
|
||||
====================
|
||||
|
||||
FoundationDB decouples its data storage technology from its data model. FoundationDB's core ordered key-value storage technology can be efficiently adapted and remapped to a broad array of rich data models. Using indexing as an example, FoundationDB's core provides no indexing and never will. Instead, a layer provides indexing by storing two kinds of key-values, one for the data and one for the index.
|
||||
|
||||
For example, the ``people/alice/eye_color = blue`` key-value stores data about Alice's eye color and the ``eye_color/blue/alice = true`` key-value stores an index of people by eye color. Now, finding all people with blue eyes is as simple as finding all keys that start with ``eye_color/blue/``. Since FoundationDB's core keeps all keys in order and all those keys share a common prefix, the operation can be efficiently implemented with a single range-read operation.
|
||||
|
||||
Of course, any ordered key-value database could use this approach. The real magic comes when you mix in true ACID transactions. This allows the indexing layer to update both the data and the index in a single transaction, ensuring their consistency. The importance of this guarantee can't be overstated. Transactions allow layers to be built simply, reliably, and efficiently.
|
|
@ -0,0 +1,29 @@
|
|||
#################
|
||||
Local Development
|
||||
#################
|
||||
|
||||
Download the FoundationDB package
|
||||
=================================
|
||||
|
||||
:doc:`Download the FoundationDB package <downloads>` for macOS (FoundationDB-*.pkg) onto your local development machine.
|
||||
|
||||
Install the FoundationDB binaries
|
||||
=================================
|
||||
|
||||
By default, the FoundationDB installer installs the binaries required to run both a client and a local development server. Begin installation by double-clicking on the downloaded FoundationDB package and following the displayed instructions.
|
||||
|
||||
For more details on installing on macOS, see :doc:`getting-started-mac`.
|
||||
|
||||
If you later wish to remove FoundationDB from a machine, follow the instruction for :ref:`uninstalling <administration-removing>`.
|
||||
|
||||
Check the status of the local database
|
||||
======================================
|
||||
|
||||
You can verify the status of the local database with the following command, which will show basic statistics::
|
||||
|
||||
fdbcli --exec status
|
||||
|
||||
Basic tutorial
|
||||
==============
|
||||
|
||||
Here's a :doc:`tutorial <class-scheduling>` that begins with "Hello world" code for connecting to the database and then walks through the basics of reading and writing data with transactions.
|
|
@ -0,0 +1,45 @@
|
|||
################################
|
||||
Moving a Cluster to New Machines
|
||||
################################
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Move a FoundationDB cluster to new machines.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
You need to move an existing cluster to new machines, and you don't want to risk data loss or downtime during the process.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
Use basic administrative commands to add new machines to your cluster, migrate the data, and remove the old machines.
|
||||
Recipe
|
||||
|
||||
To move your cluster to new machines, perform the following steps:
|
||||
|
||||
1. Provision your new machines.
|
||||
|
||||
2. Install the FoundationDB packages on each of the new machines.
|
||||
|
||||
3. Stop the FoundationDB service on the new machines::
|
||||
|
||||
$ sudo service foundationdb stop
|
||||
|
||||
4. Copy the fdb.cluster file from any one of the old machines to each of the new machines. If you need to modify the default configuration (e.g. changing the data storage location), you should do so now.
|
||||
|
||||
5. Restart the FoundationDB service on each new machine::
|
||||
|
||||
$ sudo service foundationdb start
|
||||
|
||||
6. Start ``fdbcli`` and run ``status details``. This should show a number of processes equal to both old and new machines, with processes on the original machines serving as the cluster coordinators.
|
||||
|
||||
7. Exclude the original machines from the cluster using ``exclude`` in ``fdbcli``. This command will not return until all database state has been moved off of the original machines and fully replicated to the new machines. For example::
|
||||
|
||||
fdb> exclude 192.168.1.1:4500 192.168.1.2:4500 192.168.1.3:4500
|
||||
|
||||
8. Run ``coordinators auto`` in ``fdbcli`` to move coordination state to the new machines. Please note that this will cause the fdb.cluster file to be updated with the addresses of the new machines. Any currently connected clients will be notified and (assuming they have appropriate file system permissions) will update their own copy of the cluster file. As long as the original machines are still running, any clients that connect to them will be automatically forwarded to the new cluster coordinators. However, if you have a client that has not yet connected or only connects intermittently, you will need to copy the new cluster file from one of the new machines to the client machine.
|
||||
|
||||
9. The ``status details`` command in the fdbcli will now show only the new processes (both as workers and coordinators), and you can safely shut down the older machines.
|
|
@ -0,0 +1,381 @@
|
|||
.. default-domain:: py
|
||||
.. highlight:: python
|
||||
|
||||
.. _mr-status:
|
||||
|
||||
#######################
|
||||
Machine-Readable Status
|
||||
#######################
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
FoundationDB provides status information in machine-readable JSON form (in addition to the human-readable form made available by :ref:`the command line interface <cli-status>`). This document explains how to access the machine-readable status, provides guidance for its use, and describes the JSON format used to encode it.
|
||||
|
||||
.. _mr-status-key:
|
||||
|
||||
Accessing machine-readable status
|
||||
=================================
|
||||
|
||||
You can access machine-readable status in three ways:
|
||||
|
||||
* Within ``fdbcli``, issue the command ``status json``. This command will output status information in JSON (rather than the human-readable format output by ``status`` and ``status details``). See the :ref:`cli-status` command for more information.
|
||||
* From a command shell, use fdbcli by running ``fdbcli --exec "status json"``
|
||||
* From any client, read the key ``\xFF\xFF/status/json``. The value of this key is a JSON object serialized to a byte string with UTF-8 encoding. In Python, given an open database ``db``, the JSON object can be read and deserialized with::
|
||||
|
||||
import json
|
||||
status = json.loads(db['\xff\xff/status/json'])
|
||||
|
||||
Guidance regarding versioning
|
||||
=============================
|
||||
|
||||
The JSON format of the machine-readable status is not considered part of our API and, in particular, is not governed by the :ref:`versioning mechanism <api-python-versioning>` used to facilitate API upgrades. A client that makes use of the machine-readable status should be prepared to handle possible format changes across versions.
|
||||
|
||||
Format changes will be governed as follows:
|
||||
|
||||
* We will not make arbitrary changes to the JSON format; we will make such changes only as required by changes in the underlying system characteristics relevant to status reporting.
|
||||
* We may add fields as needed to report new categories of data.
|
||||
* We may remove a field if a new version of FoundationDB renders the field obsolete.
|
||||
* We will *not* change the semantics of an existing field. If the data relating to a field changes in a manner that is incompatible with previous usage, the field will be deleted and replaced by a newly named field.
|
||||
|
||||
JSON format
|
||||
===========
|
||||
|
||||
The following format informally describes the JSON containing the status data. The possible values of ``<name_string>`` and ``<description_string>`` are described in :ref:`mr-status-message`. The format is representative: *any field can be missing at any time*, depending on the database state. Clients should be prepared to flexibly handle format variations resulting from different database states.
|
||||
|
||||
.. code-block:: javascript
|
||||
|
||||
{
|
||||
"client": {
|
||||
"cluster_file": {
|
||||
"path": "/etc/foundationdb/fdb.cluster",
|
||||
"up_to_date": true
|
||||
},
|
||||
"coordinators": {
|
||||
"coordinators": [
|
||||
{
|
||||
"address": "10.0.4.1:4701",
|
||||
"reachable": true
|
||||
}
|
||||
],
|
||||
"quorum_reachable": true
|
||||
},
|
||||
"database_status": {
|
||||
"available": true,
|
||||
"healthy": true
|
||||
},
|
||||
"messages": [
|
||||
{
|
||||
"name": <name_string>,
|
||||
"description": <description_string>
|
||||
}
|
||||
],
|
||||
"timestamp": 1415650089
|
||||
},
|
||||
"cluster": {
|
||||
"clients": {
|
||||
"count": 1,
|
||||
"supported_versions": [
|
||||
{
|
||||
"client_version": "4.2.0",
|
||||
"connected_clients": [
|
||||
{
|
||||
"address": "127.0.0.1:1234",
|
||||
"log_group": "default"
|
||||
}
|
||||
],
|
||||
"count": 1,
|
||||
"protocol_version": "fdb00a400050001",
|
||||
"source_version": "a21e22025bafd7da5e642182683d450e7b68ca26"
|
||||
}
|
||||
]
|
||||
},
|
||||
"cluster_controller_timestamp": 1415650089,
|
||||
"configuration": {
|
||||
"coordinators_count": 1,
|
||||
"excluded_servers": [
|
||||
{"address": "10.0.4.1"}
|
||||
],
|
||||
"logs": 2, // this field will be absent if a value has not been explicitly set
|
||||
"policy": "zoneid^3 x 1",
|
||||
"proxies": 5, // this field will be absent if a value has not been explicitly set
|
||||
"redundancy": {
|
||||
"factor": < "single"
|
||||
| "double"
|
||||
| "triple"
|
||||
| "custom"
|
||||
| "two_datacenter"
|
||||
| "three_datacenter"
|
||||
| "three_data_hall"
|
||||
>
|
||||
},
|
||||
"resolvers": 1, // this field will be absent if a value has not been explicitly set
|
||||
"storage_engine": < "ssd"
|
||||
| "memory"
|
||||
| "custom"
|
||||
>
|
||||
},
|
||||
"data": {
|
||||
"average_partition_size_bytes": 0,
|
||||
"least_operating_space_bytes_log_server": 0,
|
||||
"least_operating_space_bytes_storage_server": 0,
|
||||
"moving_data": {
|
||||
"in_flight_bytes": 0,
|
||||
"in_queue_bytes": 0
|
||||
},
|
||||
"partitions_count": 2,
|
||||
"state": {
|
||||
"name": < "initializing"
|
||||
| "missing_data"
|
||||
| "healing"
|
||||
| "healthy_repartitioning"
|
||||
| "healthy_removing_server"
|
||||
| "healthy_rebalancing"
|
||||
| "healthy"
|
||||
>,
|
||||
"description": <string>,
|
||||
"healthy": true,
|
||||
"min_replicas_remaining": 0
|
||||
},
|
||||
"total_disk_used_bytes": 0,
|
||||
"total_kv_size_bytes": 0 // estimated
|
||||
},
|
||||
"database_available": true,
|
||||
"database_locked": false,
|
||||
"fault_tolerance": {
|
||||
"max_machine_failures_without_losing_availability": 0,
|
||||
"max_machine_failures_without_losing_data": 0
|
||||
},
|
||||
"latency_probe": { // all measurements are based on running sample transactions
|
||||
"commit_seconds": 0.0, // time to commit a sample transaction
|
||||
"read_seconds": 0.0, // time to perform a single read
|
||||
"transaction_start_seconds": 0.0, // time to start a sample transaction at normal priority
|
||||
"immediate_priority_transaction_start_seconds":0.0, // time to start a sample transaction at system immediate priority
|
||||
"batch_priority_transaction_start_seconds":0.0 // time to start a sample transaction at batch priority
|
||||
},
|
||||
"machines": {
|
||||
<id_string>: {
|
||||
"address": "10.0.4.1",
|
||||
"cpu": {
|
||||
"logical_core_utilization": 0.0 // computed as cpu_seconds / elapsed_seconds; value may be capped at 0.5 due to hyper-threading
|
||||
},
|
||||
"datacenter_id": <id_string>,
|
||||
"excluded": false,
|
||||
"locality": { // This will contain any locality fields that are provided on the command line
|
||||
"machineid": <id_string>,
|
||||
"dcid": <id_string>
|
||||
},
|
||||
"machine_id": <id_string>,
|
||||
"memory": {
|
||||
"committed_bytes": 0,
|
||||
"free_bytes": 0, // an estimate of how many bytes are free to allocate to fdbservers without swapping
|
||||
"total_bytes": 0 // an estimate of total physical RAM
|
||||
},
|
||||
"network": {
|
||||
"megabits_received": {"hz": 0.0},
|
||||
"megabits_sent": {"hz": 0.0},
|
||||
"tcp_segments_retransmitted": {"hz": 0.0}
|
||||
}
|
||||
}
|
||||
},
|
||||
"messages": [
|
||||
{
|
||||
"name": <name_string>,
|
||||
"description": <description_string>,
|
||||
"issues": [
|
||||
{
|
||||
"name": < "incorrect_cluster_file_contents"
|
||||
| ...
|
||||
>,
|
||||
"description": "Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally."
|
||||
}
|
||||
],
|
||||
"reasons": [
|
||||
{"description": <string>}
|
||||
],
|
||||
"unreachable_processes": [
|
||||
{"address": "10.0.4.1:4702"}
|
||||
]
|
||||
}
|
||||
],
|
||||
"processes": {
|
||||
<id_string>: {
|
||||
"address": "10.0.4.1:4701",
|
||||
"uptime_seconds": 1234.2345,
|
||||
"command_line": <string>,
|
||||
"cpu": {
|
||||
"usage_cores": 0.0 // average number of logical cores utilized by the process over the recent past; value may be > 1.0
|
||||
},
|
||||
"disk": {
|
||||
"busy": 0.0 // from 0.0 (idle) to 1.0 (fully busy)
|
||||
},
|
||||
"excluded": false,
|
||||
"machine_id": <id_string>,
|
||||
"fault_domain": <id_string>,
|
||||
"locality": { // This will contain any locality fields that are provided on the command line
|
||||
"machineid": <id_string>,
|
||||
"dcid": <id_string>
|
||||
},
|
||||
"memory": {
|
||||
"available_bytes": 0, //an estimate of the process' fair share of the memory available to fdbservers
|
||||
"limit_bytes": 0, // memory limit per process
|
||||
"used_bytes": 0
|
||||
},
|
||||
"messages": [
|
||||
{
|
||||
"name": <name_string>,
|
||||
"description": <description_string>,
|
||||
"raw_log_message": <string>,
|
||||
"time": 0.0,
|
||||
"type": <string>
|
||||
}
|
||||
],
|
||||
"network": {
|
||||
"current_connections":0,
|
||||
"connections_established": {"hz": 0.0},
|
||||
"connections_closed": {"hz": 0.0},
|
||||
"connection_errors": {"hz": 0.0},
|
||||
"megabits_received": {"hz": 0.0},
|
||||
"megabits_sent": {"hz": 0.0}
|
||||
},
|
||||
"roles": [
|
||||
{
|
||||
"id": <id_string>,
|
||||
"role": < "master"
|
||||
| "proxy"
|
||||
| "log"
|
||||
| "storage"
|
||||
| "resolver"
|
||||
| "cluster_controller"
|
||||
>
|
||||
}
|
||||
],
|
||||
"version": "3.0.0" // a process version will not be reported if it is not protocol-compatible; it will be absent from status
|
||||
}
|
||||
},
|
||||
"qos": {
|
||||
"limiting_queue_bytes_storage_server": 0,
|
||||
"limiting_version_lag_storage_server": 0,
|
||||
"performance_limited_by": {
|
||||
"name": <name_string>, // "workload" when not limiting
|
||||
"description": <description_string>,
|
||||
"reason_id": 0,
|
||||
"reason_server_id": <id_string>
|
||||
},
|
||||
"released_transactions_per_second": 0.0,
|
||||
"transactions_per_second_limit": 0.0,
|
||||
"worst_queue_bytes_log_server": 460,
|
||||
"worst_queue_bytes_storage_server": 0,
|
||||
"worst_version_lag_storage_server": 0
|
||||
},
|
||||
"recovery_state": {
|
||||
"name": <name_string>, // "fully_recovered" is the healthy state; other states are normal to transition through but not to persist in
|
||||
"description": <description_string>,
|
||||
"required_logs": 3,
|
||||
"required_proxies": 1,
|
||||
"required_resolvers": 1
|
||||
},
|
||||
"workload": {
|
||||
// A given counter can be reset.
|
||||
// Roughness is a measure of the "bunching" of operations (independent of hz). Perfectly
|
||||
// spaced operations will have a roughness of 1.0 . Randomly spaced (Poisson-distributed)
|
||||
// operations will have a roughness of 2.0, with increased bunching resulting in increased
|
||||
// values. Higher roughness can result in increased latency due to increased queuing.
|
||||
"bytes": {
|
||||
"written": {"counter": 0, "hz": 0.0, "roughness": 0.0}
|
||||
},
|
||||
"operations": {
|
||||
"reads": {"hz": 0.0},
|
||||
"writes": {"counter": 0, "hz": 0.0, "roughness": 0.0}
|
||||
},
|
||||
"transactions": {
|
||||
"committed": {"counter": 0, "hz": 0.0, "roughness": 0.0},
|
||||
"conflicted": {"counter": 0, "hz": 0.0, "roughness": 0.0},
|
||||
"started": {"counter": 0, "hz": 0.0, "roughness": 0.0}
|
||||
}
|
||||
},
|
||||
"layers": {
|
||||
}
|
||||
}
|
||||
}
|
||||
.. _mr-status-message:
|
||||
|
||||
Message components
|
||||
------------------
|
||||
|
||||
Several fields in the JSON object may contain messages in the format:
|
||||
|
||||
.. code-block:: javascript
|
||||
|
||||
"messages": [
|
||||
{
|
||||
"name": <name_string>,
|
||||
"description": <description_string>
|
||||
}
|
||||
]
|
||||
|
||||
Each message is an Object having at least a ``"name"`` field. The ``"description"`` is present only in some messages. Other fields may be present based on specific message instance details. The possible name and description values of a message found at a given location in the JSON object are described in the tables below.
|
||||
|
||||
==================================== =============================== =================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================
|
||||
JSON Path Name Description
|
||||
==================================== =============================== =================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================
|
||||
client.messages inconsistent_cluster_file Cluster file is not up to date. It contains the connection string ‘<value>’. The current connection string is ‘<value>’. This must mean that file permissions or other platform issues have prevented the file from being updated. To change coordinators without manual intervention, the cluster file and its containing folder must be writable by all servers and clients. If a majority of the coordinators referenced by the old connection string are lost, the database will stop working until the correct cluster file is distributed to all processes.
|
||||
client.messages no_cluster_controller Unable to locate a cluster controller within 2 seconds. Check that there are server processes running.
|
||||
client.messages quorum_not_reachable Unable to reach a quorum of coordinators.
|
||||
client.messages status_incomplete_client Could not retrieve client status information.
|
||||
client.messages status_incomplete_cluster Could not retrieve cluster status information.
|
||||
client.messages status_incomplete_coordinators Could not fetch coordinator info.
|
||||
client.messages status_incomplete_error Cluster encountered an error fetching status.
|
||||
client.messages status_incomplete_timeout Timed out fetching cluster status.
|
||||
client.messages unreachable_cluster_controller No response received from the cluster controller.
|
||||
cluster.messages client_issues Some clients of this cluster have issues.
|
||||
cluster.messages commit_timeout Unable to commit after __ seconds.
|
||||
cluster.messages read_timeout Unable to read after __ seconds.
|
||||
cluster.messages status_incomplete Unable to retrieve all status information.
|
||||
cluster.messages storage_servers_error Timed out trying to retrieve storage servers.
|
||||
cluster.messages log_servers_error Timed out trying to retrieve log servers.
|
||||
cluster.messages transaction_start_timeout Unable to start transaction after __ seconds.
|
||||
cluster.messages unreachable_master_worker Unable to locate the master worker.
|
||||
cluster.messages unreachable_processes The cluster has some unreachable processes.
|
||||
cluster.messages unreadable_configuration Unable to read database configuration.
|
||||
cluster.messages layer_status_incomplete Some or all of the layers subdocument could not be read.
|
||||
cluster.processes.<process>.messages file_open_error Unable to open ‘<file>’ (<os_error>).
|
||||
cluster.processes.<process>.messages incorrect_cluster_file_contents Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally.
|
||||
cluster.processes.<process>.messages io_error <error> occured in <subsystem>
|
||||
cluster.processes.<process>.messages platform_error <error> occured in <subsystem>
|
||||
cluster.processes.<process>.messages process_error <error> occured in <subsystem>
|
||||
==================================== =============================== =================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================
|
||||
|
||||
The JSON path ``cluster.recovery_state``, when it exists, is an Object containing at least ``"name"`` and ``"description"``. The possible values for those fields are in the following table:
|
||||
|
||||
================================ =========================================================================================================================================================================================
|
||||
Name Description
|
||||
================================ =========================================================================================================================================================================================
|
||||
reading_coordinated_state Requesting information from coordination servers. Verify that a majority of coordination server processes are active.
|
||||
locking_coordinated_state Locking coordination state. Verify that a majority of coordination server processes are active.
|
||||
reading_transaction_system_state Recovering transaction server state. Verify that the transaction server processes are active.
|
||||
configuration_missing There appears to be a database, but its configuration does not appear to be initialized.
|
||||
configuration_never_created The coordinator(s) have no record of this database. Either the coordinator addresses are incorrect, the coordination state on those machines is missing, or no database has been created.
|
||||
configuration_invalid The database configuration is invalid. Set a new, valid configuration to recover the database.
|
||||
recruiting_transaction_servers Recruiting new transaction servers.
|
||||
initializing_transaction_servers Initializing new transaction servers and recovering transaction logs.
|
||||
recovery_transaction Performing recovery transaction.
|
||||
writing_coordinated_state Writing coordinated state. Verify that a majority of coordination server processes are active.
|
||||
fully_recovered Recovery complete.
|
||||
================================ =========================================================================================================================================================================================
|
||||
|
||||
The JSON path ``cluster.qos.performance_limited_by``, when it exists, is an Object containing at least ``"name"`` and ``"description"``. The possible values for those fields are in the following table:
|
||||
|
||||
=================================== ====================================================
|
||||
Name Description
|
||||
=================================== ====================================================
|
||||
workload The database is not being saturated by the workload.
|
||||
storage_server_write_queue_size Storage server performance (storage queue).
|
||||
storage_server_write_bandwidth_mvcc Storage server MVCC memory.
|
||||
storage_server_readable_behind Storage server version falling behind.
|
||||
log_server_mvcc_write_bandwidth Log server MVCC memory.
|
||||
log_server_write_queue Storage server performance (log queue).
|
||||
min_free_space Running out of space (approaching 100MB limit).
|
||||
min_free_space_ratio Running out of space (approaching 5% limit).
|
||||
log_server_min_free_space Log server running out of space (approaching 100MB limit).
|
||||
log_server_min_free_space_ratio Log server running out of space (approaching 5% limit).
|
||||
=================================== ====================================================
|
|
@ -0,0 +1,167 @@
|
|||
#########
|
||||
Multimaps
|
||||
#########
|
||||
|
||||
:doc:`Python <multimaps>` **Java**
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Create a `multimap <http://en.wikipedia.org/wiki/Multimap>`_ data structure with `multiset <http://en.wikipedia.org/wiki/Multiset>`_ values.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Support efficient operations on multimaps, including random addition, removal, and retrieval of indexed values.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
Multimaps are a generalization of dictionaries (a.k.a. maps or associative arrays) in which each index can have multiple values. Multimaps can be further generalized by allowing a value to be present more than once for a given index, so that each index is associated with a multiset. These structures have a simple and efficient representation as FoundationDB's key-value pairs.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
We store all values of a given index using adjacent key-value pairs. This allows all values of an index to be retrieved with a single range read.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
We store all values in the multimap within a subspace, which takes care of packing our keys into byte strings.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
static {
|
||||
multi = new Subspace(Tuple.from("M"));
|
||||
}
|
||||
|
||||
Because we need to store multiple values per index, we'll store them within keys, with each (index, value) pair in its own key. To implement the multiset we’ll record the number of occurrences of each value. This is done by storing a positive integer with the key using an atomic addition. Each addition of a given value for an index will increment the count by 1:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
ByteBuffer b = ByteBuffer.allocate(8);
|
||||
b.order(ByteOrder.LITTLE_ENDIAN);
|
||||
b.putLong(1l);
|
||||
tr.mutate(MutationType.ADD, key, b.array());
|
||||
|
||||
By using a read-free atomic addition, FoundationDB guarantees that the addition operation will not conflict. As a result, values can be frequently added by multiple clients.
|
||||
|
||||
Subtracting values, on the other hand, requires a read to ensure that the value count does not fall below 0. (Hence, unlike additions, subtractions will be subject to conflicts.) We'll just delete the key if a subtraction reduces the count to 0 in order to keep the representation sparse.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*Negative value counts*
|
||||
|
||||
We can generalize the representation further by allowing the count to be an arbitrary integer rather than restricting it to a positive integer. This extension may be useful for applications that record a deficit or debt of some resource. In this case, the code becomes even simpler and more efficient: we can simply remove the read and test from subtraction, making it conflict-free like the addition operation.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here’s a simple implementation of multimaps with multisets as described:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map.Entry;
|
||||
public class MicroMulti {
|
||||
private static final FDB fdb;
|
||||
private static final Database db;
|
||||
private static final Subspace multi;
|
||||
private static final int N = 100;
|
||||
|
||||
static {
|
||||
fdb = FDB.selectAPIVersion(510);
|
||||
db = fdb.open();
|
||||
multi = new Subspace(Tuple.from("M"));
|
||||
}
|
||||
|
||||
private static void addHelp(TransactionContext tcx, final byte[] key, final long amount){
|
||||
tcx.run(new Function<Transaction,Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
ByteBuffer b = ByteBuffer.allocate(8);
|
||||
b.order(ByteOrder.LITTLE_ENDIAN);
|
||||
b.putLong(amount);
|
||||
|
||||
tr.mutate(MutationType.ADD, key, b.array());
|
||||
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private static long getLong(byte[] val){
|
||||
ByteBuffer b = ByteBuffer.allocate(8);
|
||||
b.order(ByteOrder.LITTLE_ENDIAN);
|
||||
b.put(val);
|
||||
return b.getLong(0);
|
||||
}
|
||||
|
||||
public static void add(TransactionContext tcx, final String index,
|
||||
final Object value){
|
||||
tcx.run(new Function<Transaction,Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
addHelp(tr, multi.subspace(Tuple.from(index,value)).getKey(),1l);
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static void subtract(TransactionContext tcx, final String index,
|
||||
final Object value){
|
||||
tcx.run(new Function<Transaction,Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
Future<byte[]> v = tr.get(multi.subspace(
|
||||
Tuple.from(index,value)).getKey());
|
||||
|
||||
if(v.get() != null && getLong(v.get()) > 1l){
|
||||
addHelp(tr, multi.subspace(Tuple.from(index,value)).getKey(), -1l);
|
||||
} else {
|
||||
tr.clear(multi.subspace(Tuple.from(index,value)).getKey());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static ArrayList<Object> get(TransactionContext tcx, final String index){
|
||||
return tcx.run(new Function<Transaction,ArrayList<Object> >() {
|
||||
public ArrayList<Object> apply(Transaction tr){
|
||||
ArrayList<Object> vals = new ArrayList<Object>();
|
||||
for(KeyValue kv : tr.getRange(multi.subspace(
|
||||
Tuple.from(index)).range())){
|
||||
vals.add(multi.unpack(kv.getKey()).get(1));
|
||||
}
|
||||
return vals;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static HashMap<Object,Long> getCounts(TransactionContext tcx,
|
||||
final String index){
|
||||
return tcx.run(new Function<Transaction,HashMap<Object,Long> >() {
|
||||
public HashMap<Object,Long> apply(Transaction tr){
|
||||
HashMap<Object,Long> vals = new HashMap<Object,Long>();
|
||||
for(KeyValue kv : tr.getRange(multi.subspace(
|
||||
Tuple.from(index)).range())){
|
||||
vals.put(multi.unpack(kv.getKey()).get(1),
|
||||
getLong(kv.getValue()));
|
||||
}
|
||||
return vals;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static boolean isElement(TransactionContext tcx, final String index,
|
||||
final Object value){
|
||||
return tcx.run(new Function<Transaction,Boolean>() {
|
||||
public Boolean apply(Transaction tr){
|
||||
return tr.get(multi.subspace(
|
||||
Tuple.from(index, value)).getKey()).get() != null;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
#########
|
||||
Multimaps
|
||||
#########
|
||||
|
||||
**Python** :doc:`Java <multimaps-java>`
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Create a `multimap <http://en.wikipedia.org/wiki/Multimap>`_ data structure with `multiset <http://en.wikipedia.org/wiki/Multiset>`_ values.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Support efficient operations on multimaps, including random addition, removal, and retrieval of indexed values.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
Multimaps are a generalization of dictionaries (a.k.a. maps or associative arrays) in which each index can have multiple values. Multimaps can be further generalized by allowing a value to be present more than once for a given index, so that each index is associated with a multiset. These structures have a simple and efficient representation as FoundationDB's key-value pairs.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
We store all values of a given index using adjacent key-value pairs. This allows all values of an index to be retrieved with a single range read.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
We store all values in the multimap within a subspace, which takes care of packing our keys into byte strings.
|
||||
::
|
||||
|
||||
multi = fdb.Subspace(('M',))
|
||||
|
||||
Because we need to store multiple values per index, we'll store them within keys, with each (index, value) pair in its own key. To implement the multiset we’ll record the number of occurrences of each value. This is done by storing a positive integer with the key using an atomic addition. Each addition of a given value for an index will increment the count by 1:
|
||||
::
|
||||
|
||||
tr.add(multi[index][value], struct.pack('<q', 1))
|
||||
|
||||
By using a read-free atomic addition, FoundationDB guarantees that the addition operation will not conflict. As a result, values can be frequently added by multiple clients.
|
||||
|
||||
Subtracting values, on the other hand, requires a read to ensure that the value count does not fall below 0. (Hence, unlike additions, subtractions will be subject to conflicts.) We'll just delete the key if a subtraction reduces the count to 0 in order to keep the representation sparse.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*Negative value counts*
|
||||
|
||||
We can generalize the representation further by allowing the count to be an arbitrary integer rather than restricting it to a positive integer. This extension may be useful for applications that record a deficit or debt of some resource. In this case, the code becomes even simpler and more efficient: we can simply remove the read and test from subtraction, making it conflict-free like the addition operation.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here’s a simple implementation of multimaps with multisets as described::
|
||||
|
||||
import struct
|
||||
|
||||
multi = fdb.Subspace(('M',))
|
||||
|
||||
# Multimaps with multiset values
|
||||
@fdb.transactional
|
||||
def multi_add(tr, index, value):
|
||||
tr.add(multi[index][value], struct.pack('<q', 1))
|
||||
|
||||
@fdb.transactional
|
||||
def multi_subtract(tr, index, value):
|
||||
v = tr[multi[index][value]]
|
||||
if v.present() and struct.unpack('<q', str(v))[0] > 1:
|
||||
tr.add(multi[index][value], struct.pack('<q', -1))
|
||||
else:
|
||||
del tr[multi[index][value]]
|
||||
|
||||
@fdb.transactional
|
||||
def multi_get(tr, index):
|
||||
return [multi.unpack(k)[1] for k, v in tr[multi[index].range()]]
|
||||
|
||||
@fdb.transactional
|
||||
def multi_get_counts(tr, index):
|
||||
return {multi.unpack(k)[1]:struct.unpack('<q', v)[0]
|
||||
for k, v in tr[multi[index].range()]}
|
||||
|
||||
@fdb.transactional
|
||||
def multi_is_element(tr, index, value):
|
||||
return tr[multi[index][value]].present()
|
|
@ -0,0 +1,77 @@
|
|||
#######################
|
||||
Release Notes (Alpha 5)
|
||||
#######################
|
||||
|
||||
FoundationDB Alpha 5
|
||||
====================
|
||||
|
||||
Language support
|
||||
-------------------------
|
||||
|
||||
* FoundationDB now supports :doc:`Ruby </api-ruby>`
|
||||
|
||||
* FoundationDB now supports :doc:`Node.js </api-node>`
|
||||
|
||||
* FoundationDB now supports `Java </javadoc/index.html>`_ and other JVM languages.
|
||||
|
||||
.. _alpha-5-rel-notes-features:
|
||||
|
||||
Features
|
||||
------------
|
||||
* A new :doc:`backup </backups>` system allows scheduled backups of a snapshot of the FoundationDB database to an external filesystem.
|
||||
|
||||
* :doc:`Integrated HTML documentation </index>`
|
||||
|
||||
* :ref:`Snapshot reads <snapshot isolation>` allow API clients to selectively relax FoundationDB's strong isolation guarantee. Appropriate use of them can reduce :ref:`conflict-ranges` but makes reasoning about concurrency harder.
|
||||
|
||||
* :ref:`Streaming modes <streaming-mode-python>` allow API clients to adjust how FoundationDB transfers data for range reads for improved performance.
|
||||
|
||||
* Client APIs automatically detect the appropriate network interface (local address) when connecting to a cluster, and will look for a :ref:`default-cluster-file`.
|
||||
|
||||
Compatibility
|
||||
-------------
|
||||
|
||||
* Tuples encoded with prior alpha versions are incompatible with the tuple layer in Alpha 5.
|
||||
|
||||
* Databases created with Alpha 4 will be compatible. (See :ref:`Upgrading from older versions <upgrading-from-older-versions>` for upgrade instructions)
|
||||
|
||||
* Databases created before Alpha 4 will be incompatible. (See :ref:`Upgrading from older versions <upgrading-from-older-versions>` for details)
|
||||
|
||||
Changes to all APIs
|
||||
-------------------
|
||||
|
||||
* The API version has been updated to 14.
|
||||
|
||||
* :ref:`Snapshot reads <snapshot isolation>` (see :ref:`Features <alpha-5-rel-notes-features>`, above).
|
||||
|
||||
* :ref:`Streaming modes <streaming-mode-python>` (see :ref:`Features <alpha-5-rel-notes-features>`, above).
|
||||
|
||||
* Automatic network interface detection (see :ref:`Features <alpha-5-rel-notes-features>`, above).
|
||||
|
||||
* The tuple layer supports unicode strings (encoded as UTF-8), has a more compact encoding, and is not compatible with data from prior versions.
|
||||
|
||||
* Reversed range reads are now exposed through a separate parameter rather than via a negative ``limit``.
|
||||
|
||||
* Extensible options are now exposed at the network, cluster, database and transaction levels. The parameters to :c:func:`fdb_setup_network` and :py:func:`fdb.init` have been replaced by network options.
|
||||
|
||||
* Option enumerations are available in a machine-readable format for the benefit of third-party language binding developers.
|
||||
|
||||
Python API changes
|
||||
------------------
|
||||
|
||||
* :py:func:`fdb.open` can be called with no parameters to use the :ref:`default-cluster-file`.
|
||||
|
||||
* Waiting on a Future object has changed from ``.get()`` to :py:meth:`.wait() <fdb.Future.wait>`
|
||||
|
||||
* Reversed range reads can be specified by passing a slice object with a -1 step.
|
||||
|
||||
* The convenience read methods on :py:class:`fdb.Database` are now transactional.
|
||||
|
||||
C API changes
|
||||
-------------
|
||||
|
||||
* Byte limits exposed in :c:func:`fdb_transaction_get_range`. These are not currently exposed by any of the higher level clients (and usually streaming modes should be preferred).
|
||||
|
||||
* :c:func:`fdb_future_get_keyvalue_array` returns an explicit flag indicating whether there is more data in the range beyond the limits passed to :c:func:`fdb_transaction_get_range`.
|
||||
|
||||
* ``fdb_transaction_get_range_selector`` has been eliminated - :c:func:`fdb_transaction_get_range` always takes key selectors.
|
|
@ -0,0 +1,71 @@
|
|||
#######################
|
||||
Release Notes (Alpha 6)
|
||||
#######################
|
||||
|
||||
FoundationDB Alpha 6
|
||||
====================
|
||||
|
||||
Platform support
|
||||
-------------------------
|
||||
|
||||
* FoundationDB now supports both clients and development servers on :doc:`Mac OS X </getting-started-mac>`.
|
||||
|
||||
* FoundationDB now supports both clients and development servers on (64-bit) Windows.
|
||||
|
||||
* All language APIs are supported on Linux, Mac, and Windows (except for Ruby on Windows, because there is not a 64-bit Ruby for Windows.)
|
||||
|
||||
Features
|
||||
------------
|
||||
|
||||
* The set of coordination servers can be safely :ref:`changed <configuration-changing-coordination-servers>` on-the-fly via the CLI.
|
||||
|
||||
* Unintentional deletion of the coordination state files is now ACID-safe and self-correcting when a majority of the state files still exist.
|
||||
|
||||
* The :ref:`foundationdb.conf <foundationdb-conf>` file format has changed.
|
||||
|
||||
* A new more flexible and automatic system for :ref:`network configuration <foundationdb-conf-fdbserver>`. Common server setups will auto-configure using the cluster file. More advanced setups are supported via separate configurable listen and public addresses.
|
||||
|
||||
* The CLI now supports tab-completion.
|
||||
|
||||
* The CLI now supports setting transaction options.
|
||||
|
||||
* The CLI has a new command "getrangekeys" that returns the keys in a range and omits the values.
|
||||
|
||||
* The database size estimate shown in the CLI status is much more accurate.
|
||||
|
||||
Performance
|
||||
--------------
|
||||
|
||||
* Improved latency performance for intense workloads with range-read operations.
|
||||
|
||||
* Improved performance and decreased memory usage for certain intense write workloads targeting a small set of keys (such as sequential insert).
|
||||
|
||||
Fixes
|
||||
--------
|
||||
|
||||
* An incorrect result could be returned by a range read when: (1) The range start was specified using a non-default "less than" type key selector; and (2) the range read started at the beginning of the database; and (3) the transaction also included a prior write to a key less than the key of the begin key selector.
|
||||
|
||||
* In certain cases a FoundationDB cluster would not correctly re-configure itself to achieve a more optimal usage of servers of specific machine classes.
|
||||
|
||||
Changes to all APIs
|
||||
-------------------
|
||||
|
||||
* The API version has been updated from 14 to 16. (Thanks to our API versioning technology, programs requesting API version 14 will work unmodified.)
|
||||
|
||||
* Calling the :py:meth:`reset <fdb.Transaction.reset>` method of a transaction now also resets transaction options.
|
||||
|
||||
* :ref:`System keys <system-keys>` (those beginning with the byte ``0xFF``) are now inaccessible by default.
|
||||
|
||||
* Simpler network setup: The network connection options are no longer necessary and have been deprecated.
|
||||
|
||||
* Three new transaction options (:py:meth:`READ_AHEAD_DISABLE <fdb.Transaction.options.set_read_ahead_disable>`, :py:meth:`READ_YOUR_WRITES_DISABLE <fdb.Transaction.options.set_read_your_writes_disable>`, and :py:meth:`ACCESS_SYSTEM_KEYS <fdb.Transaction.options.set_access_system_keys>`) enable more control for advanced applications.
|
||||
|
||||
Changes to the Java API
|
||||
------------------------
|
||||
|
||||
* A new construct `AsyncUtil.whileTrue() <../javadoc/com/apple/cie/foundationdb/async/AsyncUtil.html#whileTrue-com.apple.foundationdb.async.Function->`_ simplifies writing loops using the asynchronous version of the Java FDB client.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
|
||||
For changes in alpha 5, see :doc:`Release Notes (Alpha 5) <release-notes-014>`.
|
|
@ -0,0 +1,85 @@
|
|||
######################
|
||||
Release Notes (Beta 1)
|
||||
######################
|
||||
|
||||
Beta 1
|
||||
======
|
||||
|
||||
Platform support
|
||||
----------------
|
||||
|
||||
* Added AWS CloudFormation support for FoundationDB.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Servers can be safely :ref:`removed <removing-machines-from-a-cluster>` from the cluster.
|
||||
|
||||
* :ref:`Improved status <administration-monitoring-cluster-status>` with information about database configuration, health, workload, and performance.
|
||||
|
||||
* Improved resiliency against low disk space conditions.
|
||||
|
||||
* The CLI can automatically choose :ref:`coordination servers <configuration-changing-coordination-servers>`.
|
||||
|
||||
* The CLI allows multiple semicolon separated commands per line; a new --exec flag was added to the CLI to pass commands to the CLI and quit when done.
|
||||
|
||||
* Old :ref:`log files <administration-managing-trace-files>` are automatically deleted.
|
||||
|
||||
* More specific :ref:`error codes <developer-guide-error-codes>`.
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* Reduced latency of getRange when iterating through large amounts of data.
|
||||
|
||||
* Reduced idle CPU usage.
|
||||
|
||||
* Java API: Join in ArrayUtil is efficient for all container types.
|
||||
|
||||
* Java API: Optimized tuple creation.
|
||||
|
||||
Changes to all APIs
|
||||
-------------------
|
||||
|
||||
* The API version has been updated from 16 to 21. (Thanks to our API versioning technology, programs requesting earlier API versions will work unmodified.) There are no changes required to migrate from version 16 to 21.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Commit could return the error commit_conflict (renamed to not_committed) after the transaction successfully committed. (This was previously documented as a known limitation.)
|
||||
|
||||
* If a call to commit returned an error, but onError was not called, the transaction would not be reset.
|
||||
|
||||
* The memory storage engine was too aggressive in reserving disk space.
|
||||
|
||||
* If a key selector in a getRange resolved to the beginning or the end of the database, then its transaction may not have correctly conflicted with other transactions.
|
||||
|
||||
* Ranges passed to clearRange and getRange with the begin key larger than the end could incorrectly cause client API errors.
|
||||
|
||||
* Databases with small amounts of data in them (~20000 bytes) would sometimes slowly move data back and forth between the servers.
|
||||
|
||||
* Large network latencies (> ~250 ms) could impede data balancing between servers.
|
||||
|
||||
* Setting callbacks or calling ``blockUntilReady`` on a future from multiple threads resulted in an error.
|
||||
|
||||
* If a machine running the memory storage engine was killed multiple times in close succession, data loss might occur.
|
||||
|
||||
* C: The headers were not standards compliant and would not compile in some environments.
|
||||
|
||||
* Ruby: API versions were not checked for validity.
|
||||
|
||||
* Windows: The server could crash on non-English versions of Windows.
|
||||
|
||||
* Windows: Manually running fdbserver.exe could fail because of overly restrictive permissions set on shared resources.
|
||||
|
||||
* OS X: Java client had an extraneous linker dependency.
|
||||
|
||||
* Java: In multithreaded conditions, getRange and AsyncUtil.whileTrue() could sometimes never return.
|
||||
|
||||
* Python/Ruby: In multithreaded conditions, the client worker thread could crash.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,65 @@
|
|||
######################
|
||||
Release Notes (Beta 2)
|
||||
######################
|
||||
|
||||
Beta 2
|
||||
======
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* ``fdbcli`` history is stored between sessions; consecutive duplicate commands are stored as a single history entry
|
||||
* The ``fdbcli`` tool prints a minimal cluster status message if an operation does not complete in 5 seconds.
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* Support for databases up to 100TB (aggregate key-value size). We recommend you contact us for configuration suggestions for databases exceeding 10TB.
|
||||
* Reduced client CPU usage when returning locally cached values.
|
||||
* Clients do not write to the database if a value is set to its known current value.
|
||||
* Improved transaction queuing behavior when a significant portion of transactions are "System Immediate" priority.
|
||||
* Reduced downtime in certain server-rejoin situations.
|
||||
|
||||
Language APIs
|
||||
-------------
|
||||
|
||||
* All
|
||||
|
||||
* The API version has been updated from 21 to 22. (Thanks to our API versioning technology, programs requesting earlier API versions will work unmodified.) There are no changes required to migrate from version 21 to 22.
|
||||
* The ``open()`` call blocks until the client can communicate with the cluster.
|
||||
|
||||
* Node.js
|
||||
|
||||
* Support for Node.js v0.10.x.
|
||||
* Functions throw errors of type ``FDBError``.
|
||||
* Removed some variables from the global scope.
|
||||
|
||||
* Java
|
||||
|
||||
* Compiles class files with 1.6 source and target flags.
|
||||
* Single-jar packaging for all platforms. (In rare cases, setting the ``FDB_LIBRARY_PATH_FDB_JAVA`` environment variable will be required if you previously relied on loading the library from a system path.)
|
||||
|
||||
* Ruby
|
||||
|
||||
* Support for Ruby on Windows. Requires Ruby version at least 2.0.0 (x64).
|
||||
* Added implementation of ``on_ready()``.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Coordinators could fail to respond if they were busy with other work.
|
||||
* Fixed a rare segmentation fault on cluster shutdown.
|
||||
* Fixed an issue where CLI status could sometimes fail.
|
||||
* Status showed the wrong explanation when performance was limited by system write-to-read latency limit.
|
||||
* Fixed a rare issue where a "stuck" process trying to participate in the database could run out of RAM.
|
||||
* Increased robustness of FoundationDB server when loaded with large data sets.
|
||||
* Eliminated certain cases where the data distribution algorithm could do unnecessary splitting and merging work.
|
||||
* Several fixes for rare issues encountered by our fault simulation framework.
|
||||
* Certain uncommon usage of on_ready() in Python could cause segmentation faults.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,64 @@
|
|||
######################
|
||||
Release Notes (Beta 3)
|
||||
######################
|
||||
|
||||
Beta 3
|
||||
======
|
||||
|
||||
The Beta 3 release focuses on major improvements across our language APIs, including new capabilities for locality, watches, transaction cancellation and timeouts, explicit conflict ranges, and atomic operations. It also improves performance and removes known limitations.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Discover where keys are physically stored using the new :ref:`locality <api-python-locality>` API.
|
||||
* Create :ref:`watches <api-python-watches>` that asynchronously report changes to the values of specified keys.
|
||||
* :ref:`Cancel <api-python-cancel>` transactions or set them to automatically :ref:`timeout <api-python-timeout>` and cancel.
|
||||
* Explicitly add read or write :ref:`conflict ranges <api-python-conflict-ranges>`.
|
||||
* Perform :ref:`atomic operations <api-python-transaction-atomic-operations>` that transform a value (e.g. incrementing it) without client reads to avoid transaction conflicts.
|
||||
* API version updated to 23.
|
||||
|
||||
Java
|
||||
----
|
||||
|
||||
Based on customer feedback and internal testing, the API has been significantly revised for increased performance and ease of use. This is a **breaking** API change. We will continue to make the previous JAR
|
||||
available for the time being.
|
||||
|
||||
* The asynchronous programming library has been moved to its own package (``com.foundationdb.async``). The library has a host of new members for greater flexibility and more predictable error handling.
|
||||
* ``Database.run(...)`` can now return an arbitrary object from user code, simplifying use of this recommended retry loop.
|
||||
* The new interface ``Function`` replaces several interfaces: ``Mapper``, ``Block``, ``Retryable``, and ``AsyncRetryable``.
|
||||
* Added the ability to cancel any ``Future`` instance, even one not backed with native resources.
|
||||
* Removed ``onSuccess()`` and ``onFailure()`` in favor of ``map()`` and ``flatMap()``. If code needs simple triggering, ``onReady()`` is still available.
|
||||
* Range iteration via ``Transaction.getRange(...)`` starts fetching data immediately upon invocation. This simplifies development of code that reads ranges in parallel.
|
||||
* Many other changes that facilitate writing fast, efficient, and correct Java applications!
|
||||
|
||||
Python
|
||||
------
|
||||
|
||||
* Python API methods that :ref:`accept a key <api-python-keys>` will also accept a Python object with an ``as_foundationdb_key()`` method that returns a key. Likewise, methods that accept a value will also accept a Python object with an ``as_foundationdb_value()`` method that returns a value.
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* Clients can preferentially communicate with servers on the same machine or in the same datacenter for :ref:`location-aware load balancing <api-python-database-options>`.
|
||||
* Removed debugging code from the client library that was included in versions up through Beta 2, leading to higher, more predictable performance.
|
||||
* Improved data distribution algorithms to optimize data movement during failure scenarios.
|
||||
* Improved range-read iterators in Node.js using lazy evaluation.
|
||||
* Improved client-side range-read prefetching in Node.js, Ruby, and Python.
|
||||
* Incrementally improved performance across all language bindings.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* A storage node could be prevented from rejoining the cluster until the process was restarted.
|
||||
* A reverse ``GetRange`` request using a row limit and an end key selector that enters the system keyspace could return too few results.
|
||||
* A machine power loss immediately following a process restart could result in an invalid transaction log.
|
||||
* ``GetRange`` could improperly cache too large a range of data when the end key selector resolved past the end of user keyspace, temporarily resulting in incorrect answers to read requests.
|
||||
* In Node.js, reusing a range iterator for a second request could result in an incomplete result set.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,55 @@
|
|||
###################
|
||||
Release Notes (1.0)
|
||||
###################
|
||||
|
||||
1.0.1
|
||||
=====
|
||||
|
||||
* Fix segmentation fault in client when there are a very large number of dependent operations in a transaction and certain errors occur.
|
||||
|
||||
1.0.0
|
||||
=====
|
||||
|
||||
After a year and a half of Alpha and Beta testing, FoundationDB is now commercially available. Thanks to the help of the thousands of Alpha and Beta testers in our community, we believe that this release is highly robust and capable.
|
||||
|
||||
You can now find pricing and order enterprise licenses online.
|
||||
|
||||
The new Community License now permits free-of-charge use for production systems with up to 6 server processes and for non-production systems with an unlimited number of processes.
|
||||
|
||||
There are only minor technical differences between this release and the 0.3.0 release of August 7, 2013:
|
||||
|
||||
Java
|
||||
----
|
||||
* ``clear(Range)`` replaces the now deprecated ``clearRangeStartsWith()``.
|
||||
|
||||
Python
|
||||
------
|
||||
* Windows installer supports Python 3.
|
||||
|
||||
Node and Ruby
|
||||
-------------
|
||||
* String option parameters are converted to UTF-8.
|
||||
|
||||
All
|
||||
---
|
||||
* API version changed to 100. Programs with lower versions continue to work.
|
||||
* Runs on Mac OS X 10.7.
|
||||
* Improvements to installation packages, including package paths and directory modes.
|
||||
* Eliminated cases of excessive resource usage in the locality API.
|
||||
* Watches are disabled when read-your-writes functionality is disabled.
|
||||
* Fatal error paths now call ``_exit()`` instead of ``exit()``.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* A few Python API entry points failed to respect the ``as_foundationdb_key()`` convenience interface.
|
||||
* ``fdbcli`` could print commit version numbers incorrectly in Windows.
|
||||
* Multiple watches set on the same key were not correctly triggered by a subsequent write in the same transaction.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
|
||||
* :doc:`Beta 3 (API Version 23) <release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,274 @@
|
|||
###################
|
||||
Release Notes (2.0)
|
||||
###################
|
||||
|
||||
2.0.10
|
||||
======
|
||||
|
||||
Release 2.0.10 is protocol-compatible with all prior 2.0.x releases. Users should continue to employ the bindings released with 2.0.0, with the exception of the following bindings:
|
||||
|
||||
* Java - updated to 2.0.8
|
||||
* PHP - updated to 2.0.8
|
||||
* Python - updated to 2.0.6
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Clients running long enough to execute 2\ :sup:`32` internal tasks could experience a reordering of client operations. The outcome of this reordering is undefined and could include crashes or incorrect behavior.
|
||||
* The ``fdbcli`` command-line interface would incorrectly report an internal error when running ``coordinators auto`` if there weren't enough machines in the cluster.
|
||||
|
||||
2.0.9
|
||||
=====
|
||||
|
||||
Release 2.0.9 is protocol-compatible with all prior 2.0.x releases. Users should continue to employ the bindings released with 2.0.0, with the exception of the following bindings:
|
||||
|
||||
* Java - updated to 2.0.8
|
||||
* PHP - updated to 2.0.8
|
||||
* Python - updated to 2.0.6
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Long-running clusters using the ``ssd`` storage engine could eventually deprioritize failure monitoring, causing busy machines to be considered down.
|
||||
|
||||
2.0.8
|
||||
=====
|
||||
|
||||
Release 2.0.8 is protocol-compatible with all prior 2.0.x releases. Users should continue to employ the bindings released with 2.0.0, with the exception of the following bindings:
|
||||
|
||||
* Java - updated to 2.0.8
|
||||
* PHP - updated to 2.0.8
|
||||
* Python - updated to 2.0.6
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Resetting a transaction did not release its memory.
|
||||
* Windows: FoundationDB client applications could crash when starting up due to a race condition.
|
||||
* Ubuntu: Ubuntu software center reported that the FoundationDB package was of bad quality.
|
||||
|
||||
PHP
|
||||
---
|
||||
* Package updated to support PHP 5.4+ (instead of 5.3+).
|
||||
* Fix: ``get_boundary_keys()`` could fail to complete successfully if certain retryable errors were encountered.
|
||||
* Fix: Bindings set error reporting level, which could interfere with clients that used alternate settings.
|
||||
|
||||
Java
|
||||
----
|
||||
* Fix: Calling ``getRange`` on a ``Transaction`` could leak memory.
|
||||
|
||||
2.0.7
|
||||
=====
|
||||
|
||||
Release 2.0.7 is protocol-compatible with all prior 2.0.x releases. Users should continue to employ the bindings released with 2.0.0, with the exception of the following bindings:
|
||||
|
||||
* Java - updated to 2.0.4
|
||||
* PHP - updated to 2.0.6
|
||||
* Python - updated to 2.0.6
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Updated FDBGnuTLS plugin with GnuTLS 3.2.15, incorporating the fix for `GNUTLS-SA-2014-3 <http://gnutls.org/security.html#GNUTLS-SA-2014-3>`_.
|
||||
* Linux and Mac OS X: Processes configured with a 5-digit port number would listen on the wrong port.
|
||||
|
||||
2.0.6
|
||||
=====
|
||||
|
||||
Release 2.0.6 is protocol-compatible with all prior 2.0.x releases. Users should continue to employ the bindings released with 2.0.0, with the exception of the following bindings:
|
||||
|
||||
* Java - updated to 2.0.4
|
||||
* PHP - updated to 2.0.6
|
||||
* Python - updated to 2.0.6
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Memory storage engine files could grow very large in an idle database.
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* When disk bound, the storage server would use up to 90% of the disk IOPS on data balancing rather than processing new writes. As a result, the system could perform at about 10% of its maximum speed while restoring durability after a machine failure. The storage server can now use at most 50% of its disk IOPS on data balancing.
|
||||
|
||||
PHP
|
||||
---
|
||||
|
||||
* Fix: ``get_boundary_keys()`` would throw an error if passed a transaction.
|
||||
* Fix: Options which take an integer parameter would not use the value supplied.
|
||||
|
||||
Python
|
||||
------
|
||||
|
||||
* Fix: Python 3 compatibility was broken.
|
||||
* Fix: Choosing a custom Python path in the Windows installer would install to the wrong location.
|
||||
|
||||
2.0.5
|
||||
=====
|
||||
|
||||
Release 2.0.5 is protocol-compatible with 2.0.0, 2.0.1, 2.0.2, 2.0.3, and 2.0.4. Users should continue to employ the bindings released with 2.0.0, with the exception of the Java bindings, which have been updated to 2.0.4, and the PHP bindings, which have been updated to 2.0.5.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* Clients and servers that specified a cluster file as a filename only (without path) could crash when the coordinators were changed.
|
||||
|
||||
PHP
|
||||
---
|
||||
* Directory layer partitions created with the PHP bindings were incompatible with other language bindings. Contact us if you have data stored in a directory partition created by PHP that can't easily be restored and needs to be migrated.
|
||||
|
||||
2.0.4
|
||||
=====
|
||||
|
||||
Release 2.0.4 is protocol-compatible with 2.0.0, 2.0.1, 2.0.2, and 2.0.3. Users should continue to employ the bindings released with 2.0.0, with the exception of the Java bindings, which have been updated to 2.0.4.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* Clearing a key larger than the legal limit of 10 kB caused the database to crash and become unreadable.
|
||||
* Explicitly added write conflict ranges were ignored when read-your-writes was disabled.
|
||||
|
||||
Java
|
||||
----
|
||||
* ``ByteArrayUtil.compareUnsigned()`` failed to return in some circumstances.
|
||||
|
||||
2.0.3
|
||||
=====
|
||||
|
||||
Release 2.0.3 is protocol-compatible with 2.0.0, 2.0.1, and 2.0.2. There are no updates to the language bindings, so users should continue to employ the bindings released with 2.0.0.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* Updated FDBGnuTLS plugin with GnuTLS 3.2.12, incorporating fixes for `GNUTLS-SA-2014-1 <http://gnutls.org/security.html#GNUTLS-SA-2014-1>`_ and `GNUTLS-SA-2014-2 <http://gnutls.org/security.html#GNUTLS-SA-2014-2>`_.
|
||||
* When inserting a large number of keys close to the key size limit, server logs were unexpectedly verbose.
|
||||
|
||||
2.0.2
|
||||
=====
|
||||
|
||||
Release 2.0.2 is protocol-compatible with 2.0.0 and 2.0.1. There are no updates to the language bindings, so users should continue to employ the bindings released with 2.0.0.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* Windows: Possible database corruption when the FoundationDB service is stopped but unable to kill its child processes.
|
||||
|
||||
2.0.1
|
||||
=====
|
||||
|
||||
Release 2.0.1 is protocol-compatible with 2.0.0. There are no updates to the language bindings, so users should continue to employ the bindings released with 2.0.0.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* In some cases, a server reincluded after previous exclusion would not participate in data distribution.
|
||||
* Clients could not reliably connect to multiple clusters.
|
||||
* The calculation of usable disk space on Linux and Mac OS X improperly included space reserved for superuser.
|
||||
|
||||
2.0.0
|
||||
=====
|
||||
|
||||
New language support
|
||||
--------------------
|
||||
* `Go <../godoc/fdb.html>`_
|
||||
* PHP
|
||||
|
||||
New layers available in all languages
|
||||
-------------------------------------
|
||||
* The :ref:`Subspace <developer-guide-sub-keyspaces>` layer provides a recommended way to define subspaces of keys by managing key prefixes.
|
||||
* The :ref:`Directory <developer-guide-directories>` layer provides a tool to manage related subspaces as virtual directories. Recommended as a convenient and high-performance way to organize and layout different kinds of data within a single FoundationDB database.
|
||||
|
||||
Security
|
||||
--------
|
||||
* Added certificate-based :doc:`Transport Layer Security </tls>` to encrypt network traffic.
|
||||
|
||||
Monitoring
|
||||
----------
|
||||
* The ``fdbcli`` command-line interface reports information and warnings about available memory.
|
||||
|
||||
Performance
|
||||
-----------
|
||||
* Improved client CPU performance overall.
|
||||
* Greatly improved client CPU performance for range-read operations.
|
||||
* Greatly improved concurrency when issuing writes between reads.
|
||||
* Snapshot reads are now fully cached.
|
||||
* Trade off: ``get_key`` is cached, but ``get_key`` now also retrieves the value of the key, using network bandwidth. (Using ``OPTION_RYW_DISABLE`` will avoid both the cache and the network bandwidth.)
|
||||
* Windows: Improved latencies.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* In rare cases when many keys very close to the maximum key size are inserted, the database could become unavailable.
|
||||
* ``GetReadVersion`` did not properly throw ``transaction_cancelled`` when called on a transaction that had been cancelled.
|
||||
* When using the ``access_system_keys`` option, a ``get_range_startswith(\xff)`` would incorrectly return no results.
|
||||
* ``get_range_startswith``, when invoked using a key ending in the byte ``\xff``, could return results outside the desired range.
|
||||
* Linux: A process could become unresponsive if unable to find a TCP network device in ``/proc/net/snmp``.
|
||||
* Destroying client threads leaked memory.
|
||||
* Database availability could be unnecessarily compromised in certain rare, low-disk conditions on a "transaction" class machine.
|
||||
* Writing a zero-byte value to the key ``''`` caused the database to crash.
|
||||
* Mac OS X: Power loss could cause data corruption.
|
||||
|
||||
Other changes
|
||||
-------------
|
||||
* To avoid confusing situations, any use of a transaction that is currently committing will cause both the commit and the use to throw a ``used_during_commit`` error.
|
||||
* The ``FDB_CLUSTER_FILE`` environment variable can point to a cluster file that takes precedence over both the current working directory and (e.g., in Linux) ``/etc/foundationdb/fdb.cluster``.
|
||||
* Disabled unloading the ``fdb_c`` library to prevent consequent unavoidable race conditions.
|
||||
* Discontinued testing and support for Ubuntu 11.04. We continue to support Ubuntu 11.10 and later.
|
||||
|
||||
Java
|
||||
----
|
||||
* Support for API version 200 and backwards compatibility with previous API versions.
|
||||
* New APIs for allocating and managing keyspace (:ref:`Directory <developer-guide-directories>`).
|
||||
* In most cases, exceptions thrown in synchronous-style Java programs will have the original calling line of code in the backtrace.
|
||||
* Native resources are handled in a safer and more efficient manner.
|
||||
* Fix: ``AsyncUtil.whenReady`` crashed when the future being waited on was an error.
|
||||
* Fix: Calling ``strinc`` on an empty string or a string containing only ``\xff`` bytes threw an exception.
|
||||
* Fix: Trailing null bytes on the result of ``strinc`` are removed.
|
||||
|
||||
Node
|
||||
----
|
||||
* Support for API version 200 and backwards compatibility with previous API versions.
|
||||
* New APIs for allocating and managing keyspace (:ref:`Directory <developer-guide-directories>`).
|
||||
* Support for the :ref:`Promise/A+ specification <api-node-promises>` with supporting utilities.
|
||||
* Futures can take multiple callbacks. Callbacks can be added if the original function was called with a callback. The Future type is exposed in our binding.
|
||||
* Added ``as_foundationdb_key`` and ``as_foundationdb_value`` support.
|
||||
* Node prints a stack trace if an error occurs in a callback from V8.
|
||||
* Snapshot transactions can be used in retry loops.
|
||||
* The :ref:`methods <api-node-setAndWatch>` ``db.setAndWatch`` and ``db.clearAndWatch`` now return an object with a watch member instead of a future.
|
||||
* Fix: Could not use the ``'this'`` pointer with the retry decorator.
|
||||
* Fix: Node transactional decorator didn't return a result to the caller if the function was called with a transaction.
|
||||
* Fix: The program could sometimes crash when watches were manually cancelled.
|
||||
|
||||
Ruby
|
||||
----
|
||||
* Support for API version 200 and backwards compatibility with previous API versions.
|
||||
* New APIs for allocating and managing keyspace (:ref:`Directory <developer-guide-directories>`).
|
||||
* Tuple and subspace range assume the empty tuple if none is passed.
|
||||
* Added ``as_foundationdb_key`` and ``as_foundationdb_value`` support.
|
||||
* Snapshot transactions can be used in retry loops.
|
||||
* Allow specifying the API version multiple times, so long as the same version is used each time.
|
||||
* Fix: ``FDB.options.set_trace_enable`` threw an exception when passed a ``nil`` value.
|
||||
|
||||
Python
|
||||
------
|
||||
* Support for API version 200 and backwards compatibility with previous API versions.
|
||||
* New APIs for allocating and managing keyspace (:ref:`Directory <developer-guide-directories>`).
|
||||
* Snapshot transactions can be used in retry loops.
|
||||
* Support for gevent 1.0.
|
||||
* Renamed the bitwise atomic operations (``and``, ``or``, ``xor``) to ``bit_and``, ``bit_or``, ``bit_xor``. Added aliases for backwards compatibility.
|
||||
* Fix: ``get_range_startswith`` didn't work with ``as_foundationdb_key``
|
||||
* Fix: ``fdb.locality.get_boundary_keys`` and ``fdb.locality.get_addresses_for_key`` did not support ``as_foundationdb_key``.
|
||||
|
||||
C
|
||||
-
|
||||
* Support for API version 200 and backwards compatibility with previous API versions.
|
||||
|
||||
.NET
|
||||
----
|
||||
* Support for API version 200 and backwards compatibility with previous API versions.
|
||||
* New APIs for allocating and managing keyspace (:ref:`Directory <developer-guide-directories>`).
|
||||
|
||||
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`1.0 (API Version 100) <release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) <release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,194 @@
|
|||
###################
|
||||
Release Notes (3.0)
|
||||
###################
|
||||
|
||||
3.0.8
|
||||
=====
|
||||
|
||||
Release 3.0.8 is protocol-compatible with all prior 3.0.x releases. All users should continue to employ the bindings released with 3.0.2, with the exception of the following:
|
||||
|
||||
* Node.js - updated to 3.0.6
|
||||
* Ruby - updated to 3.0.7
|
||||
* Java - updated to 3.0.8
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* Backup: the backup agent could crash in some circumstances, preventing a backup from completing.
|
||||
* Linux: On some systems, disk space usage tracking could be inaccurate.
|
||||
* In rare cases, range reading could get stuck in an infinite past_version loop.
|
||||
* Range reading with a begin key selector that resolved to the end of the database might not set the correct conflict range.
|
||||
|
||||
Java
|
||||
----
|
||||
* Fix: getBoundaryKeys could throw a NullPointerException.
|
||||
|
||||
3.0.7
|
||||
=====
|
||||
|
||||
Release 3.0.7 is protocol-compatible with all prior 3.0.x releases. All users should continue to employ the bindings released with 3.0.2, with the exception of the following:
|
||||
|
||||
* Node.js - updated to 3.0.6
|
||||
* Ruby - updated to 3.0.7
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* ``fdbcli`` would segmentation fault if there was a semicolon after a quoted string.
|
||||
* :ref:`Atomic operations <api-python-transaction-atomic-operations>` performed on keys that had been :ref:`snapshot read <api-python-snapshot-reads>` would be converted into a set operation.
|
||||
* Reading a key to which an atomic operation had already been applied would cause the read to behave as a snapshot read.
|
||||
* In rare scenarios, it was possible for the memory holding the result of a read to be released when a transaction was reset.
|
||||
* If available RAM was negative, it was reported as a very large number in status.
|
||||
|
||||
Ruby
|
||||
----
|
||||
* Fix: ``FDB`` objects could not be garbage collected.
|
||||
|
||||
3.0.6
|
||||
=====
|
||||
|
||||
Release 3.0.6 is protocol-compatible with all prior 3.0.x releases. All users should continue to employ the bindings released with 3.0.2, with the exception of the following:
|
||||
|
||||
* Node.js - updated to 3.0.6
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* Read-latency probes for status incorrectly returned zero.
|
||||
* Commit-latency probe for status included the time to acquire its read version.
|
||||
* Client and server could crash when experiencing problems with network connections.
|
||||
|
||||
Node.js
|
||||
-------
|
||||
* Fix: npm source package did not compile on Mac OS X 10.9 or newer.
|
||||
|
||||
Windows
|
||||
-------
|
||||
* Added registry key during installation.
|
||||
|
||||
3.0.5
|
||||
=====
|
||||
|
||||
Release 3.0.5 is protocol-compatible with all prior 3.0.x releases. This release contains only a bug fix for Windows packages; Linux and Mac OS X packages for 3.0.5 are identical to those for 3.0.4. All users should continue to employ the bindings released with 3.0.2, with the exception of the following:
|
||||
|
||||
* Node.js - updated to 3.0.3 if downloaded from ``npm``.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* Windows: fix Visual Studio 2013 code generation bug on older processors or versions of Windows that don't support the AVX instruction set (see https://connect.microsoft.com/VisualStudio/feedback/details/811093).
|
||||
|
||||
3.0.4
|
||||
=====
|
||||
|
||||
Release 3.0.4 is protocol-compatible with all prior 3.0.x releases. Users should continue to employ the bindings released with 3.0.2, with the exception of the following:
|
||||
|
||||
* Node.js - updated to 3.0.3 if downloaded from ``npm``.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
* Mac OS X: backup agent used 100% CPU even when idle.
|
||||
* Backups were inoperative on databases with greater than 32-bit versions.
|
||||
* Backup agents were not started on Windows.
|
||||
* Restore required write permissions on files.
|
||||
* The backup client did not report errors properly in all scenarios.
|
||||
* ``fdbserver -v`` did not print the version.
|
||||
|
||||
Node.js
|
||||
-------
|
||||
* Fixed a compilation problem on Linux and Mac OS X as distributed on ``npm``. (Note: The corrected binding is distributed as version 3.0.3.)
|
||||
|
||||
3.0.2
|
||||
=====
|
||||
|
||||
Upgrades
|
||||
--------
|
||||
|
||||
* When upgrading from version 2.0.x to 3.0.x, you should consult :ref:`Upgrading to 3.0 <upgrading-from-older-versions>`.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Status information provided in :doc:`machine-readable JSON </mr-status>` form.
|
||||
* Differential backups and backup of selective keyspaces added to :ref:`backup tool <backup-wait>`.
|
||||
* Clients may retrieve :ref:`machine-readable status <mr-status-key>`, :ref:`cluster filepath, and cluster file contents <cluster-file-client-access>` by reading designated system keys from the database.
|
||||
* Two new :ref:`atomic operations <api-python-transaction-atomic-operations>`: max and min.
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* Increased maximum writes per second from 200,000 to 11,000,000.
|
||||
* Improved latencies, particularly on underutilized clusters.
|
||||
* Improved performance of backup and restore.
|
||||
* Improved client CPU usage.
|
||||
* Better rate-limiting when committing very large transactions.
|
||||
* Improved performance while servers rejoin the cluster.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* B-tree vacuuming could exhibit poor performance after large deletions of data.
|
||||
* Computation of memory availability was not correct on newer Linux versions.
|
||||
* Integers could overflow when setting range limits.
|
||||
* With the memory storage engine, a key could be lost after multiple reboots in quick succession.
|
||||
|
||||
Client
|
||||
------
|
||||
|
||||
* Support for API version 300 and backwards compatibility with previous API versions.
|
||||
* By default, :ref:`snapshot reads <snapshot isolation>` see writes within the same transaction. The previous behavior can be achieved using transaction options.
|
||||
* The :ref:`transaction size limit <large-transactions>` includes conflict ranges.
|
||||
* Explicitly added read or write :ref:`conflict ranges <api-python-conflict-ranges>` and :ref:`watches <api-python-watches>` for keys that begin with ``\xFF`` require one of the transaction options ``access_system_keys`` or ``read_system_keys`` to be set.
|
||||
* New network options for ``trace_max_logs_size`` and ``trace_roll_size`` for an individual client's trace files.
|
||||
* New transaction options: max_retry_delay, read_system_keys.
|
||||
* All errors cause :ref:`watches <api-python-watches>` to trigger.
|
||||
* All errors cause a transaction to reset (previously true only of some errors).
|
||||
|
||||
Java
|
||||
----
|
||||
|
||||
* ``ReadTransactionContext`` added next to ``TransactionContext``, allowing ``read()`` and ``readAsync()`` composable read-only operations on transactions.
|
||||
* The ``Future`` interface adds ``getInterruptibly()`` and ``blockInterruptibly()``, which propagate ``InterruptedException`` to the calling code.
|
||||
* Exception-handling logic is reworked in ``map()``, ``flatMap()``, and ``rescue()`` to propagate ``OutOfMemoryError`` and ``RejectedExecutionException`` instead of the spurious ``SettableAlreadySet`` exception.
|
||||
* Performance is improved for applications that use many blocking-style ``get()`` calls.
|
||||
|
||||
Node.js
|
||||
-------
|
||||
* Fix: ``fdb.open``, ``fdb.createCluster``, and ``cluster.openDatabase`` didn't use the callback in API versions 22 or lower.
|
||||
* Tuple performance is improved.
|
||||
|
||||
PHP
|
||||
---
|
||||
|
||||
* Snapshot reads have a ``transact`` function.
|
||||
|
||||
Python
|
||||
------
|
||||
|
||||
* Bindings work in Cygwin.
|
||||
* The :ref:`transactional decorator <api-python-transactional-decorator>` no longer warns of a transaction approaching the 5 second limit.
|
||||
|
||||
Ruby
|
||||
----
|
||||
|
||||
* Fix: ``db.get``, ``get_key``, and ``get_and_watch`` returned Futures instead of actual values.
|
||||
|
||||
Other changes
|
||||
-------------
|
||||
|
||||
* Versions increase by 1 million per second instead of 1 thousand per second.
|
||||
* Removed support for Ubuntu 11.10.
|
||||
* Python binding has been removed from Linux packages.
|
||||
* In ``fdbcli``, ``getrange`` does a prefix range read if no end key is specified.
|
||||
* In ``fdbcli``, added an option to disable the initial status check.
|
||||
|
||||
Note on version numbers
|
||||
-----------------------
|
||||
|
||||
Version 3.0.2 is the first publicly released version in the 3.0.x series. Versions 3.0.0-1 were limited-availability releases with the same feature set.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`2.0 (API Version 200) <release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) <release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) <release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,69 @@
|
|||
###################
|
||||
Release Notes (4.0)
|
||||
###################
|
||||
|
||||
4.0.2
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Streaming mode ``EXACT`` was ignoring the ``target_bytes`` parameter.
|
||||
|
||||
Java
|
||||
----
|
||||
|
||||
* Added a ``toString`` method to the Tuple class.
|
||||
|
||||
4.0.1
|
||||
=====
|
||||
|
||||
Fdbcli
|
||||
------
|
||||
|
||||
* Added a "configure auto" command which will recommend a setting for proxies and logs (not resolvers) along with machine class changes.
|
||||
* Added a "setclass" command which can change a process's machine class from the cli.
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* Improved the recovery speed of the transaction subsystem.
|
||||
* Improved the stability of the transaction rate under saturating workloads.
|
||||
* Made the transaction log more memory efficient.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Added support for Versionstamp atomic operations.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* It was not safe to allocate multiple directories concurrently in the same transaction in the directory layer.
|
||||
|
||||
Java
|
||||
----
|
||||
|
||||
* Changed the package for the Java bindings from com.foundationdb to com.apple.cie.foundationdb.
|
||||
|
||||
Python
|
||||
------
|
||||
|
||||
* Tuple support for integers up to 255 bytes.
|
||||
|
||||
Other changes
|
||||
-------------
|
||||
|
||||
* Added detailed metric logging available through Scope.
|
||||
* An optional configuration parameter has been added that allows you to specify a seed cluster file.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`3.0 (API Version 300) <release-notes-300>`
|
||||
* :doc:`2.0 (API Version 200) <release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) <release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) <release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,53 @@
|
|||
###################
|
||||
Release Notes (4.1)
|
||||
###################
|
||||
|
||||
4.1.1
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Many short-lived file access metrics were being created.
|
||||
* A completed backup could be improperly marked as incomplete.
|
||||
* In rare scenarios the resolvers could fail to make progress.
|
||||
|
||||
4.1.0
|
||||
=====
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* Significantly improved cluster performance in a wide variety of machine failure scenarios.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Clients can now load multiple versions of the client library, and will gracefully switch to the appropriate version when the server is upgraded.
|
||||
* A new operating mode for ``fdbbackup`` writes backup data files into the blob store.
|
||||
* Transactions no longer automatically reset after a successful commit.
|
||||
* Added ability to set network options with environment variables.
|
||||
* Added a new API function for determining the value to which atomic versionstamp operations in a transaction were transformed or would have been transformed.
|
||||
* Improved logic for integrating manually-assigned machine classes with other constraints on role locations.
|
||||
* Added a new machine class ``stateless`` which is the top priority location for resolvers, proxies, and masters.
|
||||
* Added a new machine class ``log`` which is the top priority location for transaction logs.
|
||||
* Trace events are now event metrics that are exposed in Scope.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* A log could attempt to recover from a partially recovered set of logs when fast recovery was enabled.
|
||||
* A rare scenario could cause a crash when a master is recovering metadata from the previous generation of logs.
|
||||
* Streaming mode ``EXACT`` was ignoring the ``target_bytes`` parameter.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`4.0 (API Version 400) <release-notes-400>`
|
||||
* :doc:`3.0 (API Version 300) <release-notes-300>`
|
||||
* :doc:`2.0 (API Version 200) <release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) <release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) <release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,35 @@
|
|||
###################
|
||||
Release Notes (4.2)
|
||||
###################
|
||||
|
||||
4.2.1
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* The Java bindings had an incorrectly named native extension on Linux.
|
||||
|
||||
4.2.0
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* A new utility allows :doc:`Backups </backups>` of a cluster directly into another FoundationDB database. It is designed also to support asynchronous replication and disaster recovery.
|
||||
* A new version of the `Java bindings <../javadoc-completable/index.html>`_ adds support for Java 8 Completable Futures.
|
||||
* Information on the versions of connected clients has been added to :doc:`Machine-Readable Status </mr-status>`.
|
||||
* Information on the status of running backups has been added to :doc:`Machine-Readable Status </mr-status>`.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`4.1 (API Version 410) <release-notes-410>`
|
||||
* :doc:`4.0 (API Version 400) <release-notes-400>`
|
||||
* :doc:`3.0 (API Version 300) <release-notes-300>`
|
||||
* :doc:`2.0 (API Version 200) <release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) <release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) <release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,37 @@
|
|||
###################
|
||||
Release Notes (4.3)
|
||||
###################
|
||||
|
||||
4.3.0
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Improved DR throughput by having mutations copied into the DR database before applying them.
|
||||
* Renamed db_agent to dr_agent.
|
||||
* Added more detailed DR and backup active task detail into layer status.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Backup seconds behind did not update in continuous mode.
|
||||
* DR layer status did not report correctly.
|
||||
* The Java bindings had an incorrectly named native extension on Linux.
|
||||
* DR status would crash if called before a DR had been started.
|
||||
* Changed the blob restore read pattern to work around blob store issues.
|
||||
* External clients do not load environment variable options.
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`4.2 (API Version 420) <release-notes-420>`
|
||||
* :doc:`4.1 (API Version 410) <release-notes-410>`
|
||||
* :doc:`4.0 (API Version 400) <release-notes-400>`
|
||||
* :doc:`3.0 (API Version 300) <release-notes-300>`
|
||||
* :doc:`2.0 (API Version 200) <release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) <release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) <release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,73 @@
|
|||
###################
|
||||
Release Notes (4.4)
|
||||
###################
|
||||
|
||||
4.4.2
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Backup's minimum unit of progress is now a single committed version, allowing progress to be made when the database is very unhealthy.
|
||||
|
||||
Fixes
|
||||
--------
|
||||
|
||||
* Options being disabled in fdbcli required an unnecessary parameter.
|
||||
* In rare situations, an incorrect backup index could be written. Contact us if you need to restore data from a v4.4.1 or earlier backup.
|
||||
* A crash could occur on startup in fdbbackup and fdbcli.
|
||||
* A data corruption bug observed on OS X was fixed. The issue has never been observed on other platforms.
|
||||
|
||||
4.4.1
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Added support for streaming writes. This allows a client to load an ordered list of mutations into the database in parallel, and once they are all loaded, the mutations will be applied to the database in order.
|
||||
* DR uses streaming writes to significantly improve throughput.
|
||||
* Restore was rewritten so that many clients can participate in restoring data, significantly improving restore speed. The command line restore tool interface has been updated to support this new capability.
|
||||
* Cluster files now support comments (using the '#' character).
|
||||
* A wide variety of new client-side statistics are logged in client trace files every 5 seconds.
|
||||
* Status reports the generation of the system. The generation is incremented every time there is a failure (and recovery) in the transaction subsystem.
|
||||
* Added a new machine-wide identification token. This token is used in place of the user-supplied "machine ID" in instances where true physical machine is the unit of interest. This change will allow for reporting tools to output the actual number of physical machines present in a cluster.
|
||||
* Added per-process metrics for total disk capacity and free space to status json output that allow for more repeatable and expected reporting of host disk usage. These metrics are based on the "data-dir" parameter to fdbserver and will be reported without regard to whether the process is using the disk or not.
|
||||
* Added backup size estimates to status json output.
|
||||
* Added process uptime seconds to status json output.
|
||||
* Added a flag indicating whether the database is locked to status json output.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Only processes which can become logs are counted towards fault tolerance.
|
||||
* A long running process would have a local estimate of time which differed greatly from system clock of the machine the process was running on.
|
||||
* DR errors were not being reported properly in DR status.
|
||||
* Backup and DR layer status expiration and cleanup now use database read version instead of time. <rdar://problem/24805824>
|
||||
|
||||
Java
|
||||
----
|
||||
|
||||
* The `ReadTransaction` interface supports the ability to set transaction options.
|
||||
|
||||
Other Changes
|
||||
-------------
|
||||
|
||||
* Removed support for the old log system (pre 3.0). To upgrade to 4.4+ from a version before 3.0, first upgrade to a version between 3.0 and 4.3.
|
||||
* Removed trace event spam in backup and DR.
|
||||
* Backup and DR only report the most recent error, rather than a list of errors.
|
||||
* Updated language binding 'API version not supported' error message to include the version requested and supported. <rdar://problem/23769929>
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`4.3 (API Version 430) <release-notes-430>`
|
||||
* :doc:`4.2 (API Version 420) <release-notes-420>`
|
||||
* :doc:`4.1 (API Version 410) <release-notes-410>`
|
||||
* :doc:`4.0 (API Version 400) <release-notes-400>`
|
||||
* :doc:`3.0 (API Version 300) <release-notes-300>`
|
||||
* :doc:`2.0 (API Version 200) <release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) <release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) <release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,143 @@
|
|||
###################
|
||||
Release Notes (4.5)
|
||||
###################
|
||||
|
||||
4.5.6
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Disabled debug SevError trace event when applyMutations is cancelled
|
||||
* Fixed problem skipping publishable files
|
||||
* Publish debug symbols for files for all platforms
|
||||
|
||||
4.5.5
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* DR and backup restoration could stall when encountering transactions larger than 5 MB. <rdar://problem/28744048>
|
||||
* The ``_valid`` field in layer status was missing when the client couldn't communicate with the cluster. <rdar://problem/27643333>
|
||||
* Backup uploads to blobstore were not verifying their checksums. <rdar://problem/28417369>
|
||||
|
||||
4.5.4
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Creating transactions with the multi-version client was not thread safe. <rdar://problem/28546688>
|
||||
* The status latency probe is now performed separately from other status gathering code. <rdar://problem/28119480>
|
||||
* Watches could fire early needlessly and in rare cases may not fire when they should. <rdar://problem/27957628>
|
||||
* Change the type of the ``query_queue_max`` attribute in status from an object to an integer. <rdar://problem/26709846>
|
||||
* ``fdbserver`` arguments were not properly preserved in log output. <rdar://problem/28407513>
|
||||
* Increase priority of getting read versions over commits on the master proxy.
|
||||
* Java: Inserting a non-Long number into a Tuple and reading it back out threw a ClassCastException. <rdar://problem/28260594>
|
||||
|
||||
4.5.3
|
||||
=====
|
||||
|
||||
Fdbcli
|
||||
------
|
||||
|
||||
* Fix: key and value output did not escape the backslash ('\\') character. <rdar://problem/27832343>
|
||||
* Spaces in key and value output are no longer escaped. <rdar://problem/27832343>
|
||||
|
||||
Java-completable
|
||||
----------------
|
||||
|
||||
* Fix: Range queries with a limit could sometimes return fewer items than requested. <rdar://problem/27879470>
|
||||
|
||||
Status
|
||||
------
|
||||
|
||||
* Fix: backup and DR ``range_bytes_written`` and ``mutation_log_bytes_written`` are reset when backup or DR is restarted. <rdar://problem/27640774>
|
||||
* Added ``_valid`` and ``_error`` fields to layer status. <rdar://problem/27643333>
|
||||
|
||||
4.5.2
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Fixed thread safety issues with backup and DR.
|
||||
* Fixed a load balancing problem when a machine was already failed when a client starts.
|
||||
* Golang: adding an explicit conflict key included the key immediately following in the conflict range.
|
||||
|
||||
4.5.1
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* When a process is added to a cluster, it will be given data at a much faster rate. <rdar://problem/24075759>
|
||||
* Improved the speed at which the cluster reacts to a failed process.
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Calculating status in large databases could cause slow tasks. <rdar://problem/25805251>
|
||||
* Fdbrestore printed an error when passed the ``--help`` flag. <rdar://problem/26126793>
|
||||
* A multi-version API external client that was a duplicate could cause a crash at client shutdown. <rdar://problem/25838039>
|
||||
* A multi-version API external client that failed to load would cause a crash at client shutdown. <rdar://problem/26611919>
|
||||
* Status could report an incorrect database size in rare scenarios. <rdar://problem/25295683>
|
||||
* Data distribution would stop working (both exclude and new recruitment) if a call to commit on the storage server could hang indefinitely. <rdar://problem/26276166>
|
||||
* ``fdbcli`` would quit on some commands that were not errors or continue after some real errors when run with ``--exec``. <rdar://problem/25755317>
|
||||
* Trace files generated by clients using the multi-version API could have name collisions. <rdar://problem/25884486>
|
||||
|
||||
Fdbcli
|
||||
------
|
||||
|
||||
* Client issues are now aggregated by name and have an addresses list. Status details prints a truncated view of these. <rdar://problem/23469072>
|
||||
* Status json lists incompatible clients. <rdar://problem/24415680>
|
||||
* Status json includes processes that couldn't be reached in the processes list. <rdar://problem/26703551>
|
||||
* Exclude does not run if it will drop free space below 10%. This check can be bypassed by adding the word ``FORCE`` as a parameter. <rdar://problem/22922266>
|
||||
* Added write mode, which must be turned on to set or clear keys. <rdar://problem/25779641>
|
||||
* Added the ``kill`` command for killing processes.
|
||||
|
||||
Fdbmonitor
|
||||
----------
|
||||
|
||||
* The restart logic for dead child processes now uses a backoff. <rdar://problem/26100711>
|
||||
* Added option to remove WD40 environment variables. <rdar://problem/26100669>
|
||||
* ``fdbmonitor`` will create the lockfile directory if it doesn't exist. <rdar://problem/26502883>
|
||||
* Added support for monitoring symbolic link changes in the configuration file path. <rdar://problem/26100843>
|
||||
* Added an option to disable killing of child processes when the configuration changes. <rdar://problem/26100939>
|
||||
* Added support for larger process IDs. <rdar://problem/26350469>
|
||||
|
||||
Backup
|
||||
------
|
||||
|
||||
* Blobstore URLs can now contain multiple IP addresses, separated by commas, over which to load balance requests. <rdar://problem/23095572>
|
||||
|
||||
Bindings
|
||||
--------
|
||||
|
||||
* Add error predicate testing to client bindings. This new functionality should help complex use cases write correct transaction retry loops where dispatching on error classes is needed. <rdar://problem/24492860>
|
||||
|
||||
Other Changes
|
||||
-------------
|
||||
|
||||
* Maximum shard size increased from 100MB to 500MB. <rdar://problem/21225031>
|
||||
* Support backslash as an escape character for semicolons and backslashes when setting network options using environment variables. <rdar://problem/23902390>
|
||||
* Add ``logGroup`` attribute to rolled trace events. <rdar://problem/25726509>
|
||||
* Calling get range with a begin key == ``\xff\xff/worker_interfaces`` will return a list of serialized worker interfaces. Calling set with the key ``\xff\xff/reboot_worker`` and a value which is a serialized worker interface will reboot that process. <rdar://problem/26101019>
|
||||
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`4.4 (API Version 440) <release-notes-440>`
|
||||
* :doc:`4.3 (API Version 430) <release-notes-430>`
|
||||
* :doc:`4.2 (API Version 420) <release-notes-420>`
|
||||
* :doc:`4.1 (API Version 410) <release-notes-410>`
|
||||
* :doc:`4.0 (API Version 400) <release-notes-400>`
|
||||
* :doc:`3.0 (API Version 300) <release-notes-300>`
|
||||
* :doc:`2.0 (API Version 200) <release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) <release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) <release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) <release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) <release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) <release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) <release-notes-014>`
|
|
@ -0,0 +1,130 @@
|
|||
#############
|
||||
Release Notes
|
||||
#############
|
||||
|
||||
4.6.5
|
||||
=====
|
||||
|
||||
Bindings
|
||||
--------
|
||||
|
||||
* Java bindings now perform marshaling off of the network thread. <rdar://problem/32413365>
|
||||
|
||||
|
||||
4.6.4
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Added ability to use --io_trust_seconds in a warn-only mode, which logs a trace event rather than failing the process when a disk operation takes a long time. This is enabled with --io_trust_warn_only. <rdar://problem/32344389>
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Disk operation timeouts now cause the process to restart rather than hang indefinitely. <rdar://problem/31888796>
|
||||
* ``fdbdr switch`` did not start the DR in the opposite direction correctly, resulting in mutations being lost. <rdar://problem/32598128>
|
||||
* Lowered backup and DR batch sizes to avoid large packet warnings. <rdar://problem/30933203>
|
||||
* Remove partial pipelining of tlog commits.
|
||||
|
||||
4.6.3
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Added the ability to run a consistency check of a database using a new server role. <rdar://problem/30903086>
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Added the ability to automatically shutdown processes if a disk operation takes a long time to complete. This is enabled with --io_trust_seconds. <rdar://problem/31229332>
|
||||
* Too many outstanding storage recruitment requests causes the cluster controller to hang. <rdar://problem/30271581>
|
||||
* Corrected issue with Ubuntu installer package on Ubuntu 16.04 not starting daemon. <rdar://problem/27752324>
|
||||
* Package non-Linux builds of JNI component into Java jars. <rdar://problem/30786246>
|
||||
* Published backup-related binaries on macOS were incorrectly pointing to symbolic link specification files. <rdar://problem/31403408>
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* We no longer fsync trace files. <rdar://problem/30400189>
|
||||
* Lowered the default bandwidth shard splitting knobs for better performance with hot key ranges. <rdar://problem/30234328>
|
||||
|
||||
4.6.2
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* The tlog could commit more than 100MB at a time <rdar://problem/29312187>
|
||||
* Metrics with filename component not present in trace events <rdar://problem/29933550>
|
||||
* Setting new locality information causes missing process metrics in status details <rdar://problem/29992530>
|
||||
* FDB processes killed via CLI could hang while killing themselves <rdar://problem/29518674>
|
||||
* Enabled recovery of on-disk data files in the event of a very specific rare corruption situation <rdar://problem/29679886>
|
||||
* Process messages get reported as errors by status, but don't get attributed to a process in the status details list <rdar://problem/29866630>
|
||||
* DR prematurely reported progress for work that needed to be retried <rdar://problem/29741198>
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* Storage engine performance improvements to reduce the overhead that ssd-2 requires for its benefits over ssd-1 <rdar://problem/29332661>
|
||||
* Lowered the default fetch keys parallelism to slow down data distribution <rdar://problem/29934862>
|
||||
|
||||
4.6.1
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Starting a new DR on a large database can cause the secondary cluster to lose availability <rdar://problem/29422130>
|
||||
* Secondary clusters that have been upgraded were reporting "primary" metrics <rdar://problem/29407318>
|
||||
* Backup and DR could get stuck if too many tasks timed out simultaneously <rdar://problem/29422234>
|
||||
|
||||
4.6.0
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Added a new storage engine type ``ssd-2`` that includes page checksums and more efficient storage of large values. The previous storage engine has been renamed ``ssd-1``, and the storage engine ``ssd`` is an alias for ``ssd-2``. <rdar://problem/28565614> <rdar://problem/28723720>
|
||||
* DR and Restore won't overwrite a non empty database <rdar://problem/27082102> <rdar://problem/27065780>
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* Improve performance of the ssd storage engine in databases with large keys or values <rdar://problem/28701207>
|
||||
* Improved cluster recovery speed <rdar://problem/28877814>
|
||||
* Restore is faster due to better load leveling across the keyspace <rdar://problem/27554051>
|
||||
* Reduced the conflict ranges applied for get range calls in rare cases <rdar://problem/28034705>
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Backup to Blobstore sends and verifies MD5 sums for uploads <rdar://problem/23077230>
|
||||
* Backup restoration could be unrestorable in certain cases <rdar://problem/27933144>
|
||||
* Clients using the multi-version client functionality would incorrectly report incompatible connections in status <rdar://problem/28396098>
|
||||
* Backup and DR network metrics were incorrectly reported as 0 <rdar://problem/28589577>
|
||||
* Java: fix race condition when removing an empty directory which could lead to a NoSuchElementException <rdar://problem/28858833>
|
||||
* Fixed a source of potential crashes in fdbcli <rdar://problem/27063940>
|
||||
|
||||
Status
|
||||
------
|
||||
|
||||
* The following fields were added: cluster.data.moving_data.total_written_bytes, cluster.qos.limiting_queue_bytes_storage_server, cluster.qos.worst_version_lag_storage_server, cluster.qos.limiting_version_lag_storage_server, cluster.qos.transaction_per_second_limit, cluster.qos.released_transactions_per_second, cluster.qos.performance_limited_by.reason_id, and cluster.database_available
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`4.5 (API Version 450) </old-release-notes/release-notes-450>`
|
||||
* :doc:`4.4 (API Version 440) </old-release-notes/release-notes-440>`
|
||||
* :doc:`4.3 (API Version 430) </old-release-notes/release-notes-430>`
|
||||
* :doc:`4.2 (API Version 420) </old-release-notes/release-notes-420>`
|
||||
* :doc:`4.1 (API Version 410) </old-release-notes/release-notes-410>`
|
||||
* :doc:`4.0 (API Version 400) </old-release-notes/release-notes-400>`
|
||||
* :doc:`3.0 (API Version 300) </old-release-notes/release-notes-300>`
|
||||
* :doc:`2.0 (API Version 200) </old-release-notes/release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) </old-release-notes/release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) </old-release-notes/release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) </old-release-notes/release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) </old-release-notes/release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) </old-release-notes/release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) </old-release-notes/release-notes-014>`
|
|
@ -0,0 +1,176 @@
|
|||
#############
|
||||
Release Notes
|
||||
#############
|
||||
|
||||
5.0.7
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Blob backups became corrupt when handling non-retryable errors. <rdar://problem/35289547>
|
||||
* Blob backup did not retry all http errors correctly. <rdar://problem/34937616>
|
||||
|
||||
5.0.6
|
||||
=====
|
||||
|
||||
5.0.5
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Set a default memory limit of 8GB on all backup and DR executables. This limit is configurable on the command line. <rdar://problem/34744417>
|
||||
* The backup agent would keep attempting to write a file to blob for up to an hour after the task was cancelled. <rdar://problem/34745079>
|
||||
* Incorrect blob backup destination URLs could be parsed as correct but missing IP addresses. <rdar://problem/34751574>
|
||||
* Blob load balancing and per address connection limits have been improved. <rdar://problem/34744419>
|
||||
* Fdbmonitor now supports 0-parameter flags. <rdar://problem/34738924>
|
||||
* The read latencies reported in status were higher than what clients observed. <rdar://problem/33877094>
|
||||
|
||||
5.0.4
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Logs continued to make their data persistent to disk after being removed. <rdar://problem/33852607>
|
||||
* Removed logs did not delete their data before shutting down. <rdar://problem/33852342>
|
||||
* In rare scenarios, a disk error which occurred during log recruitment could cause the recruitment to hang indefinitely.
|
||||
|
||||
5.0.3
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* In rare scenarios, recovery could get stuck for 10 minutes. <rdar://problem/33782338> <rdar://problem/33780273>
|
||||
* The consistency check did not work on locked databases. <rdar://problem/33241411>
|
||||
* In rare scenarios, backup, DR, or fdbcli could hang indefinitely. <rdar://problem/33763769>
|
||||
* Some transaction log metrics were not being reported. <rdar://problem/30313222>
|
||||
* Some network metrics were reported incorrectly as extremely large numbers. <rdar://problem/32364301> <rdar://problem/32363905>
|
||||
|
||||
5.0.2
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Functionality to slowly delete large files with incremental truncation was not enabled. <rdar://problem/33550683>
|
||||
* Fixed a source of crashes from fdbcli. <rdar://problem/32933471>
|
||||
* Data distribution was prematurely reporting that it had started.
|
||||
|
||||
Bindings
|
||||
--------
|
||||
|
||||
* Go: Use fully-qualified import paths for fdb dependencies. <rdar://problem/32932617>
|
||||
|
||||
Other
|
||||
-----
|
||||
|
||||
* Publish header files and static libraries for flow and flow bindings on Linux and macOS. <rdar://problem/33191326>
|
||||
|
||||
5.0.1
|
||||
=====
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Bytes input and bytes durable on the log would drift apart due to rounding errors.
|
||||
|
||||
5.0.0
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* All recoveries no longer copy log data before completion. As a result, the fast_recovery_double and fast_recovery_triple configurations have been removed. <rdar://problem/30235865>
|
||||
* Added a new configuration ``three_data_hall`` where a single data hall failure cannot bring down a cluster. <rdar://problem/30822968>
|
||||
* Multiple log processes can be within the same zone. <rdar://problem/29407578>
|
||||
* Clients have access to sampled latency statistics for their operations. <rdar://problem/29757812>
|
||||
* Added network checksums. <rdar://problem/30703358>
|
||||
* Fault tolerance is restored much quicker after a storage server failure. <rdar://problem/30125038>
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* Improved recovery speed after rebooting the cluster. <rdar://problem/32956590>
|
||||
* Improved saturation performance of write-heavy workloads. <rdar://problem/30381001>
|
||||
* We no longer require extra log durability for fast recoveries. <rdar://problem/30235865>
|
||||
* Backup/DR now use far less cluster resources while idle. <rdar://problem/28374226> <rdar://problem/28640412>
|
||||
* Improved load balancing performance. <rdar://problem/29289012>
|
||||
* Reduced conflict range sizes when performing get range queries with key selectors such that the resolved begin key is greater than or equal to the resolved end key. <rdar://problem/30561532>
|
||||
* Added functionality to slowly delete large files with incremental truncation. <rdar://problem/30193500>
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* Fixed a pathology where multiple successive failures could lead to a long 30+ minute availability outage. <rdar://problem/30235865>
|
||||
* Updated status to have failures of old tlogs included in the failure tolerance calculation. <rdar://problem/30615411>
|
||||
* The fdbserver and fdbbackup processes could return a successful error code after a fatal error. <rdar://problem/31350017>
|
||||
* Fault tolerance did not reflect coordinators sharing the same machine ID. <rdar://problem/31195167>
|
||||
* Prevent the DR seconds behind measurement from potentially returning a negative amount. <rdar://problem/32235105>
|
||||
* Increased the priority of all cluster controller work to prevent the cluster controller from being starved by other work on the same process. <rdar://problem/32958023>
|
||||
* Fixed a rare crash in the DR agent. <rdar://problem/30766452>
|
||||
* fdbcli and fdb_c clients logs had 0 values for most ProcessMetrics log event fields. <rdar://problem/31017524>
|
||||
* DR could get stuck if the time required to copy range data was longer than the task timeout. <rdar://problem/32958570>
|
||||
|
||||
Status
|
||||
------
|
||||
|
||||
* Improved latency probe accuracy when the cluster is loaded. <rdar://problem/30465855>
|
||||
* Report GRV latencies at all priorities in the latency probe. <rdar://problem/30465855>
|
||||
* For the SSD storage engine, available disk space now includes space within data files that is not currently in use and can be reused. <rdar://problem/29998454>
|
||||
* Storage servers report how far they are lagging behind the logs. ``fdbcli`` now reports servers that are lagging sufficiently far behind. <rdar://problem/30166503>
|
||||
* Status json "incompatible_connections" did not work with multiversion clients. <rdar://problem/28396098>
|
||||
* Added connection counts and establish/close metrics to status json. <rdar://problem/28393970>
|
||||
|
||||
Bindings
|
||||
--------
|
||||
|
||||
* API version updated to 500.
|
||||
* Tuples now support single- and double-precision floating point numbers, UUIDs, booleans, and nested tuples. <rdar://problem/30053926>
|
||||
* Add ``TRANSACTION_LOGGING_ENABLE`` transaction option that causes the details of a transaction's operations to be logged to the client trace logs. <rdar://problem/32074484>
|
||||
* Add ``USED_DURING_COMMIT_PROTECTION_DISABLE`` transaction option that prevents operations performed during that transaction's commit from causing the commit to fail. <rdar://problem/30378251>
|
||||
* Add ``ENABLE_SLOW_TASK_PROFILING`` network option that logs backtraces for long running flow tasks. <rdar://problem/30975759>
|
||||
* ``getBoundaryKeys`` can be used on locked databases. <rdar://problem/28760070>
|
||||
* Flow: API versions prior to 500 are no longer supported. <rdar://problem/32433458>
|
||||
* Flow: ``Cluster::createDatabase`` no longer takes a DB name parameter. <rdar://problem/32433458>
|
||||
* Node: API versions prior to 500 are no longer supported. <rdar://problem/32433437>
|
||||
* Node: ``fdb.open`` and ``Cluster.openDatabase`` no longer take a DB name parameter. <rdar://problem/32433437>
|
||||
* Java: API versions prior to 500 are no longer supported. <rdar://problem/30378251>
|
||||
* Java: ``FDB.open`` and ``Cluster.openDatabase`` no longer take a DB name parameter. <rdar://problem/32078379>
|
||||
* Java: Removed ``Transaction.reset`` from the API. <rdar://problem/32409970>
|
||||
* Java: ``Transaction.onError`` invalidates its ``Transaction`` and asynchronously returns a new ``Transaction`` to replace it. <rdar://problem/30378251>
|
||||
* Java: Transactions always enable the ``USED_DURING_COMMIT_PROTECTION_DISABLE`` transaction option, preventing operations that occur during a commit from causing the commit to fail. <rdar://problem/30378251>
|
||||
* Java: There are now options to set the executor for async call backs at the database and transaction level. <rdar://problem/31636701>
|
||||
* Java: Static functions that perform async operations now have overloads that allow the user to specify an executor. <rdar://problem/26143365>
|
||||
* Java: Range class now implements equals, toString, and hashCode methods. <rdar://problem/31790542>
|
||||
* Java: Tuples now expose a "stream" method to get a stream of their objects and "fromStream" to convert streams back into tuples. <rdar://problem/31767147>
|
||||
* Java: Addressed a pathology that made AsyncUtil.whileTrue susceptible to long chains of futures. <rdar://problem/30054445>
|
||||
|
||||
Other Changes
|
||||
-------------
|
||||
|
||||
* Added the ``-v``/``--version`` flag to report version information for the ``fdbcli`` binary <rdar://problem/31091644>
|
||||
* Introduced the ``data_filesystem`` command line argument for the ``fdbserver`` binary to prevent data from being written to the root drive. <rdar://problem/30716138>
|
||||
* Added a ``ClientStart`` trace event to client trace files with details about the client library being used.
|
||||
* fdbserver now rejects all unrecognized command-line arguments. <rdar://problem/31853278>
|
||||
* All fdbserver command-line options now have both short- and long-form equivalents. <rdar://problem/31853278>
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`4.6 (API Version 460) </old-release-notes/release-notes-460>`
|
||||
* :doc:`4.5 (API Version 450) </old-release-notes/release-notes-450>`
|
||||
* :doc:`4.4 (API Version 440) </old-release-notes/release-notes-440>`
|
||||
* :doc:`4.3 (API Version 430) </old-release-notes/release-notes-430>`
|
||||
* :doc:`4.2 (API Version 420) </old-release-notes/release-notes-420>`
|
||||
* :doc:`4.1 (API Version 410) </old-release-notes/release-notes-410>`
|
||||
* :doc:`4.0 (API Version 400) </old-release-notes/release-notes-400>`
|
||||
* :doc:`3.0 (API Version 300) </old-release-notes/release-notes-300>`
|
||||
* :doc:`2.0 (API Version 200) </old-release-notes/release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) </old-release-notes/release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) </old-release-notes/release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) </old-release-notes/release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) </old-release-notes/release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) </old-release-notes/release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) </old-release-notes/release-notes-014>`
|
|
@ -0,0 +1,34 @@
|
|||
:orphan:
|
||||
|
||||
##########
|
||||
Operations
|
||||
##########
|
||||
|
||||
Ready to operate an externally accessible FoundationDB cluster? You'll find what you need to know here.
|
||||
|
||||
* :doc:`building-cluster` walks you through installation of an externally accessible cluster on one or more machines. Using FoundationDB in this way is not supported on macOS.
|
||||
|
||||
* :doc:`configuration` contains *reference* information for configuring a new cluster. You should read this document before setting up a cluster for performance testing or production use.
|
||||
|
||||
* :doc:`administration` covers administration of an *existing* externally accessible cluster.
|
||||
|
||||
* :doc:`command-line-interface` covers use of the ``fdbcli`` tool.
|
||||
|
||||
* :doc:`mr-status` describes the JSON encoding of a cluster's status information.
|
||||
|
||||
* :doc:`backups` covers the FoundationDB backup tool, which provides an additional level of protection by supporting recovery from disasters or unintentional modification of the database.
|
||||
|
||||
* :doc:`platforms` describes issues on particular platforms that affect the operation of FoundationDB.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:titlesonly:
|
||||
:hidden:
|
||||
|
||||
building-cluster
|
||||
configuration
|
||||
administration
|
||||
command-line-interface
|
||||
mr-status
|
||||
backups
|
||||
platforms
|
|
@ -0,0 +1,104 @@
|
|||
###########
|
||||
Performance
|
||||
###########
|
||||
|
||||
FoundationDB uses commodity hardware to provide your applications with millions of database operations per penny.
|
||||
|
||||
Scaling
|
||||
=======
|
||||
|
||||
FoundationDB has an unmatched ability to scale linearly as you add more cores to a cluster.
|
||||
|
||||
.. image:: /images/scaling.png
|
||||
|
||||
Here, a cluster of commodity hardware scales to **8.2 million** operations/sec doing a 90% read and 10% write workload.
|
||||
|
||||
The scaling graph uses a 24-machine EC2 c3.8xlarge cluster in which each machine has a 16-core processor. We ran a FoundationDB server process on each core, yielding a 384-process cluster for the largest test, and scaled the cluster down for each smaller test.
|
||||
|
||||
Scaling is the ability to efficiently deliver operations at different scales. For FoundationDB, the relevant operations are reads and writes, measured in operations per sec. Scale is measured in the number of processes, which will usually track the number of available cores. FoundationDB offers scalability from partial utilization of a single core on a single machine to full utilization of dozens of powerful multi-core machines in a cluster.
|
||||
|
||||
Latency
|
||||
=======
|
||||
|
||||
FoundationDB has low latencies over a broad range of workloads that only increase modestly as the cluster approaches saturation.
|
||||
|
||||
.. image:: /images/latency.png
|
||||
|
||||
When run at less than **75% load**, FoundationDB typically has the following latencies:
|
||||
|
||||
================= =========================
|
||||
================= =========================
|
||||
Start transaction 0.3 - 1ms
|
||||
Read 0.1 - 1ms
|
||||
Set 0 (deferred until commit)
|
||||
Commit 1.5 - 2.5ms
|
||||
================= =========================
|
||||
|
||||
The latency graph uses a 12-machine cluster in which each machine has a 4-core (E3-1240) processor and a single SATA SSD. We ran a FoundationDB server process on each core, yielding a 48-process cluster.
|
||||
|
||||
Latency is the time required to complete a given operation. Latencies in FoundationDB are typically measured in milliseconds (ms). Like all systems, FoundationDB operates at low latencies while under low load and increasing latencies as the load approaches the saturation point. FoundationDB is engineered to keep latencies low even at moderate loads. As loads approach saturation, latencies increase as requests are queued up.
|
||||
|
||||
For FoundationDB, the significant latencies are those experienced by a FoundationDB client as it prepares and submits a transaction. Writes incur no latency until the transaction is committed. There are three actions within a transaction that do incur latency:
|
||||
|
||||
* **Transaction start**. This latency will be experienced as part of the first read in a transaction as the read version is obtained. It will typically be a few milliseconds under moderate load, but under high write loads FoundationDB tries to concentrate most transaction latency here.
|
||||
|
||||
* **Reads**. Individual reads should take under 1 ms with moderate loads. If a transaction performs many reads by waiting for each to complete before starting the next, however, these small latencies can add up. You can thus reduce total latency (and potentially conflicts) by doing as many of your reads as possible in parallel. FoundationDB supports non-blocking reads, so it's easy to perform reads without waiting on them.
|
||||
|
||||
* **Commit**. Transactions that perform writes must be committed, and the commit will not succeed until the transaction is durable with full replication. This latency will average under 3 ms with moderate loads. Only a small part of this latency impacts transaction conflicts.
|
||||
|
||||
Throughput (per core)
|
||||
=====================
|
||||
|
||||
FoundationDB provides good throughput for the full range of read and write workloads, with two fully durable storage engine options.
|
||||
|
||||
.. image:: /images/throughput.png
|
||||
|
||||
FoundationDB offers two :ref:`storage engines <configuration-storage-engine>`, optimized for distinct use cases, both of which write to disk before reporting transactions committed. For each storage engine, the graph shows throughput of a single FoundationDB process running on a **single core** with saturating read/write workloads ranging from 100% reads to 100% writes. Throughput for the unmixed workloads is about:
|
||||
|
||||
========= ========== ==============
|
||||
workload ssd engine memory engine
|
||||
========= ========== ==============
|
||||
Reads 55,000/sec 90,000/sec
|
||||
Writes 20,000/sec 35,000/sec
|
||||
========= ========== ==============
|
||||
|
||||
The throughput graph uses a single FoundationDB server process on a single core (E3-1240).
|
||||
|
||||
Throughput is the total number of operations successfully completed by a system in a given period of time. For FoundationDB, we measure throughput in operations, i.e., some mix of read and writes, per second.
|
||||
|
||||
The memory engine is optimized for datasets that entirely fit in memory, with secondary storage used for durable writes but not reads. The SSD engine is optimized for datasets that do not entirely fit in memory, with some percentage of reads being served from secondary storage.
|
||||
|
||||
Because SATA SSDs are only about 50 times slower than memory, they can be combined with memory to achieve throughputs on the same order of magnitude as memory alone as long as cache-hit rates are reasonable. The SSD engine takes advantage of this property. In contrast, spinning disks are 5,000 times slower than memory and radically degrade throughput as soon as cache hits fall appreciably below 100%.
|
||||
|
||||
FoundationDB will only reach maximum throughputs with a highly concurrent workload. In fact, for a given average latency, concurrency is the main driver of throughput.
|
||||
|
||||
Concurrency
|
||||
===========
|
||||
|
||||
FoundationDB is designed to achieve great performance under high concurrency from a large number of clients.
|
||||
|
||||
.. image:: /images/concurrency.png
|
||||
|
||||
Its asynchronous design allows it to handle very high concurrency, and for a typical workload with 90% reads and 10% writes, maximum throughput is reached at about 200 concurrent operations, achieved with **20** concurrent transactions per FoundationDB process for a workload using 10 ops/transaction.
|
||||
|
||||
The concurrency graph uses a single FoundationDB server process on a single core (E3-1240).
|
||||
|
||||
For a given system, average throughput and latency are related by a ratio known in queuing theory as Little’s Law. The practical application of this law to FoundationDB states::
|
||||
|
||||
throughput = outstanding requests / latency
|
||||
|
||||
The implication of this relation is that, at a given latency, we can maximize throughput only by concurrently submitting enough outstanding requests. A FoundationDB cluster might have a commit latency of 2 ms and yet be capable of far more than 500 commits per second. In fact, tens of thousands of commits per second are easily achievable. To achieve this rate, there must be hundreds of requests happening concurrently. Not having enough pending requests is the single biggest reason for low performance.
|
||||
|
||||
Other Effects
|
||||
=============
|
||||
|
||||
A lot of things affect the simple first-order model of performance you see here. For example:
|
||||
|
||||
* For short periods, higher write throughputs can be absorbed, giving higher performance and keeping latencies low.
|
||||
* Most workloads' reads can be cached, giving higher performance.
|
||||
* Adjacently written keys can be written much faster.
|
||||
* Large keys make the storage engine slower.
|
||||
* Large values cost more to read and write than smaller ones.
|
||||
* Not all CPUs are the same speed.
|
||||
* To keep up with the performance modeled above, your disk subsystem will need to do a little over 1 IOPS per write, and about 1 IOPS per (uncached) read.
|
||||
* Network performance tuning at the operating system level can be very important for both latency and throughput, especially in larger clusters.
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
###############
|
||||
Platform Issues
|
||||
###############
|
||||
|
||||
.. include:: guide-common.rst.inc
|
||||
|
||||
This document describes issues on particular platforms that affect the operation of FoundationDB. See :doc:`known-limitations` for broader limitations relating to FoundationDB's design or its current version.
|
||||
|
||||
.. _platform-centos-gce:
|
||||
|
||||
CentOS 6.2 on GCE
|
||||
=================
|
||||
|
||||
In a newly created instance, the CentOS 6.2 image on Google Compute Engine fails to mount the /dev/shm filesystem (used to implement POSIX shared memory) but still reports it as mounted. This bug causes any application that requires access to shared memory, including FoundationDB, to fail to function.
|
||||
|
||||
To resolve this issue, you may do one of the following:
|
||||
|
||||
* Reboot the newly created instance.
|
||||
* Attempt to unmount the /dev/shm filesystem with ``sudo umount /dev/shm``, which will report that the filesystem is not mounted, and then mount the /dev/shm filesystem with ``sudo mount /dev/shm``, which will succeed.
|
||||
* Specify a machine ID in the :ref:`foundationdb.conf <foundationdb-conf-fdbserver>` file, at which point FoundationDB will not attempt to use shared memory.
|
||||
|
||||
.. _platform-ubuntu-12:
|
||||
|
||||
Ubuntu 12.x
|
||||
===========
|
||||
|
||||
Because of a `bug in the Linux kernel <https://bugzilla.kernel.org/show_bug.cgi?id=43260>`_, **FoundationDB might deadlock when running on Ubuntu 12.04 or 12.10** using the default ext4 filesystem. This was fixed in the 3.7 kernel (released 12/10/2012) thanks to the `hard work of Dmitry Monakhov <http://lkml.indiana.edu/hypermail/linux/kernel/1210.0/03434.html>`_. Versions of Ubuntu 12.04 starting at 12.04.3 use the fixed kernel and are safe to use.
|
||||
|
||||
.. _platform-virtual-box:
|
||||
|
||||
VirtualBox
|
||||
==========
|
||||
|
||||
Running FoundationDB on VirtualBox may result in high idle CPU usage.
|
|
@ -0,0 +1,137 @@
|
|||
###############
|
||||
Priority Queues
|
||||
###############
|
||||
|
||||
:doc:`Python <priority-queues>` **Java**
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Create a data structure for `priority queues <http://en.wikipedia.org/wiki/Priority_queue>`_ supporting operations for push, pop_min, peek_min, pop_max, and peek_max. You may find it helpful to review the :doc:`Queues <queues-java>` recipe before this one.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Allow efficient operations on a shared priority queue by multiple clients acting concurrently.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
We can model a priority queue using a key formed from a tuple of three elements: an item's priority, an increasing integer encoding the order in which the item was pushed, and a random element to make the key unique. By making keys unique, we can minimize conflicts for concurrent pushes.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
The ordering of keys will sort items first by priority, then by push order, then randomly (to break ties in concurrent pushes). The minimum and maximum priority items will always be at the beginning and end of the queue, respectively, allowing us to efficiently peek or pop them.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
We create a subspace for the priority queue, which takes care of packing our tuples into byte strings.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
pq = new Subspace(Tuple.from("P"));
|
||||
|
||||
Push operations will construct a key-value pair with the subspace pq of the form
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
// (priority, count, random) = value
|
||||
|
||||
where priority is supplied by the client, count is an integer that increases by 1 for each item pushed with priority, and random is a randomly generated integer.
|
||||
|
||||
Items of the same priority that are pushed concurrently may occasionally be assigned the same count, but their keys will still be distinct and ordered (in this case, randomly). The count is derived by reading and incrementing the highest count previously used for a given priority. By using a snapshot read, we guarantee that pushing is conflict-free.
|
||||
|
||||
To implement this model, we need an efficient way of finding the first and last key in the queue. (The ordering of keys guarantees that these will always be the proper keys to pop or peek.) FoundationDB's range reads have limit and reverse options that let us accomplish this. We can find the first and last key-value pairs in the range of the pq subspace with:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
tr.getRange(pq.range(),1) // first
|
||||
tr.getRange(pq.range(),1,true) // last
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*High-Contention Pop Operations*
|
||||
|
||||
To minimize conflicts during pop operations, we can use a staging technique to service the requests. If a pop operation doesn't initially succeed, it registers a pop request in a semi-ordered set of such requests. It then enters a retry loop in which it attempts to fulfill outstanding requests.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here's a basic implementation of the model:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class MicroPriority {
|
||||
|
||||
private static final FDB fdb;
|
||||
private static final Database db;
|
||||
private static final Subspace pq;
|
||||
private static final Random randno;
|
||||
|
||||
static{
|
||||
fdb = FDB.selectAPIVersion(510);
|
||||
db = fdb.open();
|
||||
pq = new Subspace(Tuple.from("P"));
|
||||
|
||||
randno = new Random();
|
||||
}
|
||||
|
||||
public static void push(TransactionContext tcx, final Object value, final int priority){
|
||||
tcx.run((Transaction tr) -> {
|
||||
byte[] rands = new byte[20];
|
||||
randno.nextBytes(rands);
|
||||
tr.set(pq.subspace(Tuple.from(priority, nextCount(tr,priority),rands)).pack(),
|
||||
Tuple.from(value).pack());
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
private static long nextCount(TransactionContext tcx, final int priority){
|
||||
return tcx.run((Transaction tr) -> {
|
||||
for(KeyValue kv : tr.snapshot().getRange(pq.subspace(Tuple.from(priority)).range(),1,true)){
|
||||
return 1l + (long)pq.subspace(Tuple.from(priority)).unpack(kv.getKey()).get(0);
|
||||
}
|
||||
|
||||
return 0l; // None previously with this priority.
|
||||
});
|
||||
}
|
||||
|
||||
// Pop--assumes min priority queue.
|
||||
public static Object pop(TransactionContext tcx){
|
||||
return pop(tcx,false);
|
||||
}
|
||||
|
||||
// Pop--allows for either max or min priority queue.
|
||||
public static Object pop(TransactionContext tcx, final boolean max){
|
||||
return tcx.run((Transaction tr) -> {
|
||||
for(KeyValue kv : tr.getRange(pq.range(), 1, max)){
|
||||
tr.clear(kv.getKey());
|
||||
return Tuple.fromBytes(kv.getValue()).get(0);
|
||||
}
|
||||
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
// Peek--assumes min priority queue.
|
||||
public static Object peek(TransactionContext tcx){
|
||||
return peek(tcx,false);
|
||||
}
|
||||
|
||||
// Peek--allows for either max or min priority queue.
|
||||
public static Object peek(TransactionContext tcx, final boolean max){
|
||||
return tcx.run((Transaction tr) -> {
|
||||
Range r = pq.range();
|
||||
for(KeyValue kv : tr.getRange(r.begin, r.end, 1, max)){
|
||||
return Tuple.fromBytes(kv.getValue()).get(0);
|
||||
}
|
||||
|
||||
return null;
|
||||
});
|
||||
}
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
###############
|
||||
Priority Queues
|
||||
###############
|
||||
|
||||
**Python** :doc:`Java <priority-queues-java>`
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Create a data structure for `priority queues <http://en.wikipedia.org/wiki/Priority_queue>`_ supporting operations for push, pop_min, peek_min, pop_max, and peek_max. You may find it helpful to review the :doc:`queues` recipe before this one.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Allow efficient operations on a shared priority queue by multiple clients acting concurrently.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
We can model a priority queue using a key formed from a tuple of three elements: an item's priority, an increasing integer encoding the order in which the item was pushed, and a random element to make the key unique. By making keys unique, we can minimize conflicts for concurrent pushes.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
The ordering of keys will sort items first by priority, then by push order, then randomly (to break ties in concurrent pushes). The minimum and maximum priority items will always be at the beginning and end of the queue, respectively, allowing us to efficiently peek or pop them.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
We create a subspace for the priority queue, which takes care of packing our tuples into byte strings.
|
||||
::
|
||||
|
||||
pq = fdb.Subspace(('P',))
|
||||
|
||||
Push operations will construct a key-value pair of the form::
|
||||
|
||||
tr[ pq[ priority ][ count ][ random ] ] = value
|
||||
|
||||
where priority is supplied by the client, count is an integer that increases by 1 for each item pushed with priority, and random is a randomly generated integer.
|
||||
|
||||
Items of the same priority that are pushed concurrently may occasionally be assigned the same count, but their keys will still be distinct and ordered (in this case, randomly). The count is derived by reading and incrementing the highest count previously used for a given priority. By using a snapshot read, we guarantee that pushing is conflict-free.
|
||||
|
||||
To implement this model, we need an efficient way of finding the first and last key in the queue. (The ordering of keys guarantees that these will always be the proper keys to pop or peek.) FoundationDB's range reads have limit and reverse options that let us accomplish this. Given the range of the subspace::
|
||||
|
||||
r = pq.range()
|
||||
|
||||
we can find the first and last key-value pairs in the range with::
|
||||
|
||||
tr.get_range(r.start, r.stop, limit=1) # first
|
||||
tr.get_range(r.start, r.stop, limit=1, reverse=True) # last
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*High-Contention Pop Operations*
|
||||
|
||||
To minimize conflicts during pop operations, we can use a staging technique to service the requests. If a pop operation doesn't initially succeed, it registers a pop request in a semi-ordered set of such requests. It then enters a retry loop in which it attempts to fulfill outstanding requests.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here's a basic implementation of the model::
|
||||
|
||||
import os
|
||||
|
||||
pq = fdb.Subspace(('P',))
|
||||
|
||||
@fdb.transactional
|
||||
def push(tr, value, priority):
|
||||
tr[pq[priority][_next_count(tr, priority)][os.urandom(20)]] = value
|
||||
|
||||
@fdb.transactional
|
||||
def _next_count(tr, priority):
|
||||
r = pq[priority].range()
|
||||
for key, value in tr.snapshot.get_range(r.start, r.stop, limit=1, reverse=True):
|
||||
return pq[priority].unpack(key)[0] + 1
|
||||
return 0
|
||||
|
||||
@fdb.transactional
|
||||
def pop(tr, max=False):
|
||||
r = pq.range()
|
||||
for item in tr.get_range(r.start, r.stop, limit=1, reverse=max):
|
||||
del tr[item.key]
|
||||
return item.value
|
||||
|
||||
@fdb.transactional
|
||||
def peek(tr, max=False):
|
||||
r = pq.range()
|
||||
for item in tr.get_range(r.start, r.stop, limit=1, reverse=max):
|
||||
return item.value
|
|
@ -0,0 +1,131 @@
|
|||
######
|
||||
Queues
|
||||
######
|
||||
|
||||
:doc:`Python <queues>` **Java**
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Allow efficient operations on a shared queue by multiple clients acting concurrently.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
We can model a queue by assigning increasing integers that encode the order of items. To minimize conflicts for concurrent operations, we combine the integers in a tuple with a random element to make the final key unique.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
The ordering of keys preserves the FIFO order of items and therefore lets us identify the next item to be dequeued without maintaining a pointer to it.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
We store each item in the queue within a subspace, which takes care of packing our integer indexes into byte strings.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
queue = new Subspace(Tuple.from("Q"));
|
||||
|
||||
As a first cut, we could store each item with a single key-value pair using increasing integer indexes for subsequent items:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
// (queue, index) = value
|
||||
|
||||
However, this would leave concurrent enqueue operations vulnerable to conflicts. To minimize these conflicts, we can add a random integer to the key.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
// (queue, index, random) = value
|
||||
|
||||
With this data model, items enqueued concurrently may be assigned the same index, but the keys as a whole will still be ordered (in this case, randomly). By using a :ref:`snapshot read <snapshot isolation>`, we guarantee that enqueuing will be conflict-free.
|
||||
|
||||
To implement this model, we need an efficient way of finding the first and last index presently in use. FoundationDB's range reads have limit and reverse options that let us accomplish this. We can find the first and last key-value pairs in the range of the subspace with:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
tr.getRange(queue.range(), 1) // first
|
||||
tr.getRange(queue.range(), 1, true) // last
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*High-Contention Dequeue Operations*
|
||||
|
||||
To minimize conflicts during dequeue operations, we can use a staging technique to service the requests. If a dequeue operation doesn't initially succeed, it registers a dequeue request in a semi-ordered set of such requests. It then enters a retry loop in which it attempts to fulfill outstanding requests.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
The following is a simple implementation of the basic pattern:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class MicroQueue {
|
||||
|
||||
private static final FDB fdb;
|
||||
private static final Database db;
|
||||
private static final Subspace queue;
|
||||
private static final Random randno;
|
||||
|
||||
static{
|
||||
fdb = FDB.selectAPIVersion(510);
|
||||
db = fdb.open();
|
||||
queue = new Subspace(Tuple.from("Q"));
|
||||
randno = new Random();
|
||||
}
|
||||
|
||||
// Remove the top element from the queue.
|
||||
public static Object dequeue(TransactionContext tcx){
|
||||
final KeyValue item = firstItem(tcx);
|
||||
if(item == null){
|
||||
return null;
|
||||
}
|
||||
|
||||
// Remove from the top of the queue.
|
||||
tcx.run((Transaction tr) -> {
|
||||
tr.clear(item.getKey());
|
||||
return null;
|
||||
});
|
||||
|
||||
// Return the old value.
|
||||
return Tuple.fromBytes(item.getValue()).get(0);
|
||||
}
|
||||
|
||||
// Add an element to the queue.
|
||||
public static void enqueue(TransactionContext tcx, final Object value){
|
||||
tcx.run((Transaction tr) -> {
|
||||
byte[] rands = new byte[20];
|
||||
randno.nextBytes(rands); // Create random seed to avoid conflicts.
|
||||
tr.set(queue.subspace(Tuple.from(lastIndex(tr)+1, rands)).pack(),
|
||||
Tuple.from(value).pack());
|
||||
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
// Get the top element of the queue.
|
||||
private static KeyValue firstItem(TransactionContext tcx){
|
||||
return tcx.run((Transaction tr) -> {
|
||||
for(KeyValue kv : tr.getRange(queue.range(), 1)){
|
||||
return kv;
|
||||
}
|
||||
|
||||
return null; // Empty queue.
|
||||
});
|
||||
}
|
||||
|
||||
// Get the last index in the queue.
|
||||
private static long lastIndex(TransactionContext tcx){
|
||||
return tcx.run((Transaction tr) -> {
|
||||
for(KeyValue kv : tr.snapshot().getRange(queue.range(), 1, true)){
|
||||
return (long)queue.unpack(kv.getKey()).get(0);
|
||||
}
|
||||
return 0l;
|
||||
});
|
||||
}
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
######
|
||||
Queues
|
||||
######
|
||||
|
||||
**Python** :doc:`Java <queues-java>`
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Allow efficient operations on a shared queue by multiple clients acting concurrently.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
We can model a queue by assigning increasing integers that encode the order of items. To minimize conflicts for concurrent operations, we combine the integers in a tuple with a random element to make the final key unique.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
The ordering of keys preserves the FIFO order of items and therefore lets us identify the next item to be dequeued without maintaining a pointer to it.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
We store each item in the queue within a subspace, which takes care of packing our integer indexes into byte strings.
|
||||
::
|
||||
|
||||
queue = fdb.Subspace(('Q',))
|
||||
|
||||
As a first cut, we could store each item with a single key-value pair using increasing integer indexes for subsequent items:
|
||||
::
|
||||
|
||||
tr[queue[index]] = value
|
||||
|
||||
However, this would leave concurrent enqueue operations vulnerable to conflicts. To minimize these conflicts, we can add a random integer to the key.
|
||||
::
|
||||
|
||||
tr[queue[index][random_int]] = value
|
||||
|
||||
With this data model, items enqueued concurrently may be assigned the same index, but the keys as a whole will still be ordered (in this case, randomly). By using a :ref:`snapshot read <snapshot isolation>`, we guarantee that enqueuing will be conflict-free.
|
||||
|
||||
To implement this model, we need an efficient way of finding the first and last index presently in use. FoundationDB's range reads have limit and reverse options that let us accomplish this. Given the range of the subspace::
|
||||
|
||||
r = queue.range()
|
||||
|
||||
we can find the first and last key-value pairs in the range with::
|
||||
|
||||
tr.get_range(r.start, r.stop, limit=1) # first
|
||||
tr.get_range(r.start, r.stop, limit=1, reverse=True) # last
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*High-Contention Dequeue Operations*
|
||||
|
||||
To minimize conflicts during dequeue operations, we can use a staging technique to service the requests. If a dequeue operation doesn't initially succeed, it registers a dequeue request in a semi-ordered set of such requests. It then enters a retry loop in which it attempts to fulfill outstanding requests.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
The following is a simple implementation of the basic pattern::
|
||||
|
||||
import os
|
||||
|
||||
queue = fdb.Subspace(('Q',))
|
||||
|
||||
@fdb.transactional
|
||||
def dequeue(tr):
|
||||
item = first_item(tr)
|
||||
if item is None: return None
|
||||
del tr[item.key]
|
||||
return item.value
|
||||
|
||||
@fdb.transactional
|
||||
def enqueue(tr, value):
|
||||
tr[queue[last_index(tr) + 1][os.urandom(20)]] = value
|
||||
|
||||
@fdb.transactional
|
||||
def last_index(tr):
|
||||
r = queue.range()
|
||||
for key, _ in tr.snapshot.get_range(r.start, r.stop, limit=1, reverse=True):
|
||||
return queue.unpack(key)[0]
|
||||
return 0
|
||||
|
||||
@fdb.transactional
|
||||
def first_item(tr):
|
||||
r = queue.range()
|
||||
for kv in tr.get_range(r.start, r.stop, limit=1):
|
||||
return kv
|
|
@ -0,0 +1,128 @@
|
|||
#############
|
||||
Release Notes
|
||||
#############
|
||||
|
||||
5.1.0
|
||||
=====
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Backups continually write snapshots at a configured interval, reducing restore times for long running backups. <rdar://problem/25512772>
|
||||
* Old backup snapshots and associated logs can be deleted from a backup. <rdar://problem/25512772>
|
||||
* Backup files are stored in a deep folder structure. <rdar://problem/27723412>
|
||||
* Restore allows you to specify an approximate time instead of a version. <rdar://problem/34557380>
|
||||
* Backup and DR agents can be paused from ``fdbbackup`` and ``fdbdr`` respectively. <rdar://problem/34776039>
|
||||
* Added byte min and byte max atomic operations. <rdar://problem/29255441>
|
||||
* The behavior of atomic "and" and "min" operations has changed when the key doesn't exist in the database. If the key is not present, then an "and" or "min" is now equivalent to a set. <rdar://problem/29255441>
|
||||
* Exception messages are more descriptive. <rdar://problem/33665340>
|
||||
* Clients can view a sample of committed mutations. <rdar://problem/33324935>
|
||||
* When switching to a DR cluster, the commit versions on that cluster will be higher than the versions on the primary cluster. &lt;rdar://problem/33572665&gt;
|
||||
* Added a read-only lock aware transaction option. <rdar://problem/34579176>
|
||||
* Automatically suppress trace log events which occur too frequently. <rdar://problem/33764208>
|
||||
* Added a new ``multi_dc`` replication mode designed for cross data center deployments. <rdar://problem/36489132>
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
||||
* The data distribution algorithm can split the system keyspace. <rdar://problem/29932360>
|
||||
* Improved load balancing when servers are located across multiple data centers. <rdar://problem/34213649>
|
||||
* Improved read latencies after recoveries by only making servers responsible for keys if they have finished copying the data from other servers. <rdar://problem/34697182>
|
||||
* Improved recovery times by waiting until a process has finished recovering its data from disk before letting it be recruited for new roles. <rdar://problem/32000146> <rdar://problem/34212951>
|
||||
* Improved 95% read version latencies by reducing the number of logs required to confirm that a proxy has not been replaced. <rdar://problem/33196298>
|
||||
* Stopped the transaction logs from copying unneeded data after multiple successive recoveries. <rdar://problem/36488946>
|
||||
* Significantly improved the performance of range reads. <rdar://problem/33926224>
|
||||
* The cluster controller prefers to be recruited on stateless class processes and will not put other stateless roles on the same process. <rdar://problem/35155324>
|
||||
* Excluded servers no longer take on stateless roles. <rdar://problem/27110802>
|
||||
* Stateless roles will be proactively moved off of excluded processes. <rdar://problem/27110802> <rdar://problem/35155044>
|
||||
* Dramatically improved restore speeds of large disk queue files. <rdar://problem/35567320>
|
||||
* Clients get key location information directly from the proxies, significantly reducing the latency of worst case read patterns. &lt;rdar://problem/35953920&gt;
|
||||
* Reduced the amount of work incompatible clients generate for coordinators and the cluster controller. In particular, this reduces the load on the cluster caused by using the multi-version client. <rdar://problem/30897631>
|
||||
* Pop partially recovered mutations from the transaction log to save disk space after multiple successive recoveries. <rdar://problem/33755270>
|
||||
* Stopped using network checksums when also using TLS. <rdar://problem/32157852>
|
||||
* Improved cluster performance after recoveries by prioritizing processing new mutations on the logs over copying data from the previous logs. <rdar://problem/36489337>
|
||||
* Backup agents prefer reading from servers in the same data center. <rdar://problem/34213617>
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
||||
* New databases immediately configured into ``three_data_hall`` would not respect the ``three_data_hall`` constraint. <rdar://problem/34415440>
|
||||
* Exclude considered the free space of non-storage processes when determining if an exclude was safe.
|
||||
* ``fdbmonitor`` failed to start processes after fork failure. <rdar://problem/34743257>
|
||||
* ``fdbmonitor`` will only stop processes when the configuration file is deleted if ``kill_on_configuration_change`` is set. <rdar://problem/35497412>
|
||||
* The data distribution algorithm would hang indefinitely when asked to build storage teams with more than three servers.
|
||||
* Mutations from a restore could continue to be applied for a very short amount of time after a restore was successfully aborted.
|
||||
|
||||
Extremely Rare Bug Fixes
|
||||
------------------------
|
||||
|
||||
* Storage servers did not properly handle rollbacks to versions before their restored version.
|
||||
* A newly recruited transaction log configured with the memory storage engine could crash on startup.
|
||||
* The data distribution algorithm could split a key range so that one part did not have any data.
|
||||
* Storage servers could update to an incorrect version after a master failure.
|
||||
* The disk queue could report a commit as successful before the sync of the disk queue files completed.
|
||||
* A disk queue which was shutdown before completing its first commit could become unrecoverable.
|
||||
|
||||
Status
|
||||
------
|
||||
|
||||
* If a cluster cannot recover because too many transaction logs are missing, status lists the missing logs. <rdar://problem/34965531>
|
||||
* The list of connected clients includes their trace log groups. <rdar://problem/33779874>
|
||||
* Status reports if a cluster is being used as a DR destination. <rdar://problem/34971187>
|
||||
|
||||
Bindings
|
||||
--------
|
||||
|
||||
* API version updated to 510.
|
||||
* Add versionstamp support to the Tuple layer in Java and Python. <rdar://problem/25560444>
|
||||
|
||||
Java
|
||||
----
|
||||
|
||||
* API versions prior to 510 are no longer supported.
|
||||
* The bindings have been moved to the package ``com.apple.foundationdb`` from ``com.apple.cie.foundationdb``. <rdar://problem/33271641>
|
||||
* We no longer offer a version of the Java bindings with our custom futures library or support Java versions less than 8. The bindings that use completable futures have been renamed to ``fdb-java``. <rdar://problem/35029630>
|
||||
* Finalizers now log a warning to stderr if an object with native resources is not closed. This can be disabled by calling ``FDB.setUnclosedWarning()``. <rdar://problem/35421530>
|
||||
* Implementers of the ``Disposable`` interface now implement ``AutoCloseable`` instead, with ``close()`` replacing ``dispose()``.
|
||||
* ``AutoCloseable`` objects will continue to be closed in object finalizers, but this behavior is being deprecated. All ``AutoCloseable`` objects should be explicitly closed. <rdar://problem/35421530>
|
||||
* ``AsyncIterator`` is no longer closeable. <rdar://problem/35595971>
|
||||
* ``getBoundaryKeys()`` now returns a ``CloseableAsyncIterable`` rather than an ``AsyncIterator``. <rdar://problem/35421530>
|
||||
* ``Transaction.getRange()`` no longer initiates a range read immediately. Instead, the read is issued by a call to ``AsyncIterable.asList()`` or ``AsyncIterable.iterator()``. <rdar://problem/35595971>
|
||||
* Added ``hashCode()`` method to ``Subspace``. <rdar://problem/35125601>
|
||||
* Added thread names to threads created by our default executor. <rdar://problem/36077166>
|
||||
* The network thread by default will be named ``fdb-network-thread``. <rdar://problem/36077166>
|
||||
* Added an overload of ``whileTrue()`` which takes a ``Supplier``. <rdar://problem/35096338>
|
||||
* Added experimental support for enabling native callbacks from external threads. <rdar://problem/33300740>
|
||||
* Fix: Converting the result of ``Transaction.getRange()`` to a list would issue an unneeded range read. <rdar://problem/35325444>
|
||||
* Fix: range iterators failed to close underlying native resources. <rdar://problem/35595971>
|
||||
* Fix: various objects internal to the bindings were not properly closed. <rdar://problem/35541447>
|
||||
|
||||
Other Changes
|
||||
-------------
|
||||
|
||||
* Backups made prior to 5.1 can no longer be restored. <rdar://problem/25512772>
|
||||
* Backup now uses a hostname in the connection string instead of a list of IPs when backing up to blob storage. This hostname is resolved using DNS. <rdar://problem/34093405>
|
||||
* ``fdbblob`` functionality has been moved to ``fdbbackup``. <rdar://problem/25512772>
|
||||
* ``fdbcli`` will warn the user if it is used to connect to an incompatible cluster. <rdar://problem/33363571>
|
||||
* Cluster files that do not match the current connection string are no longer corrected automatically. <rdar://problem/35129575>
|
||||
* Improved computation of available memory on pre-3.14 kernels. <rdar://problem/35336487>
|
||||
* Stopped reporting blob storage connection credentials in ``fdbbackup`` status output. <rdar://problem/31483629>
|
||||
|
||||
Earlier release notes
|
||||
---------------------
|
||||
* :doc:`5.0 (API Version 500) </old-release-notes/release-notes-500>`
|
||||
* :doc:`4.6 (API Version 460) </old-release-notes/release-notes-460>`
|
||||
* :doc:`4.5 (API Version 450) </old-release-notes/release-notes-450>`
|
||||
* :doc:`4.4 (API Version 440) </old-release-notes/release-notes-440>`
|
||||
* :doc:`4.3 (API Version 430) </old-release-notes/release-notes-430>`
|
||||
* :doc:`4.2 (API Version 420) </old-release-notes/release-notes-420>`
|
||||
* :doc:`4.1 (API Version 410) </old-release-notes/release-notes-410>`
|
||||
* :doc:`4.0 (API Version 400) </old-release-notes/release-notes-400>`
|
||||
* :doc:`3.0 (API Version 300) </old-release-notes/release-notes-300>`
|
||||
* :doc:`2.0 (API Version 200) </old-release-notes/release-notes-200>`
|
||||
* :doc:`1.0 (API Version 100) </old-release-notes/release-notes-100>`
|
||||
* :doc:`Beta 3 (API Version 23) </old-release-notes/release-notes-023>`
|
||||
* :doc:`Beta 2 (API Version 22) </old-release-notes/release-notes-022>`
|
||||
* :doc:`Beta 1 (API Version 21) </old-release-notes/release-notes-021>`
|
||||
* :doc:`Alpha 6 (API Version 16) </old-release-notes/release-notes-016>`
|
||||
* :doc:`Alpha 5 (API Version 14) </old-release-notes/release-notes-014>`
|
|
@ -0,0 +1,28 @@
|
|||
###########
|
||||
Scalability
|
||||
###########
|
||||
|
||||
Scalability is widely recognized as an essential property for successful applications. Scalability is actually one of *three* properties that closely interact to shape a system's performance profile:
|
||||
|
||||
* High performance: the ability to achieve the highest performance levels in a given configuration;
|
||||
* Scalability: the ability to efficiently deliver service at very different scales;
|
||||
* Elasticity: the ability to adapt up and down in scale quickly.
|
||||
|
||||
The interaction between high performance and scalability, in particular, is often not understood. An ant colony moving dirt is scalable but not highly performant. A single bulldozer moving dirt is highly performant but not scalable.
|
||||
|
||||
All three properties are important to your business. High performance means that you won't need to redesign your architecture every time your traffic doubles in size. Scalability means that your expenses start out very small and grow with your business. Elasticity means that you can gracefully scale up and down on a continuous basis in response to demand.
|
||||
|
||||
FoundationDB is Highly Performant
|
||||
=================================
|
||||
|
||||
FoundationDB was built to optimize a range of critical performance metrics. This approach is an important differentiator among distributed databases, many of which optimize for the simplicity of their own product development effort over the performance of their product. At every level of our system, we evaluate potential designs for their real-world efficiency. We build our own benchmarks of CPUs, memory controllers, disks, networks, and SSDs. We perform modeling and simulation and change designs to maximize performance, even at the expense of the simplicity of our development. When we consider high performance, we don't just look at the theoretical scalability of various algorithms; we target and achieve :doc:`real-world numbers <performance>`: millions of operations per second.
|
||||
|
||||
FoundationDB is Scalable
|
||||
========================
|
||||
|
||||
FoundationDB offers scalability from partial utilization of a single core on a single machine to full utilization of dozens of powerful multicore machines in a cluster.
|
||||
|
||||
FoundationDB is Elastic
|
||||
=======================
|
||||
|
||||
FoundationDB allows hardware to be provisioned and deprovisioned on-the-fly in response to changing needs without interruption or degradation of service. As data is written to the database, each piece of data is automatically placed on several independent computers. This replication allows immediate load balancing, and data is automatically moved from computer to computer to balance load over a longer time period. Based on request load and data size, FoundationDB seamlessly redistributes data across its distributed servers. FoundationDB is completely elastic, responding within milliseconds to hot spots and within minutes to major changes in usage.
|
|
@ -0,0 +1,87 @@
|
|||
#####################
|
||||
Segmented Range Reads
|
||||
#####################
|
||||
|
||||
:doc:`Python &lt;segmented-range-reads&gt;` **Java**
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Perform range reads in calibrated batches.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Retrieve data in batches whose size you select based on your data model or application.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
FoundationDB supports streaming modes that make range reads efficient even for large amounts of data. You can usually get good performance by selecting the proper streaming mode. However, there are particular cases in which you may want to exercise finer grained control of data retrieval. You can exercise this control using the limit parameter.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
This approach works with arbitrary ranges, which are, by definition, ordered. The goal here is to be able to walk through sub-ranges in order.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
A range read returns a container that issues asynchronous reads to the database. The client usually processes the data by iterating over the values returned by the container. The API balances latency and bandwidth by fetching data in batches as determined by the ``streaming_mode`` parameter. Streaming modes allow you to customize this balance based on how you intend to consume the data. The default streaming mode is quite efficient. However, if you anticipate that your range read will retrieve a large amount of data, you should select a streaming mode to match your use case. For example, if you're iterating through a large range and testing against a condition that may result in early termination, you can use the ``small`` streaming mode:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
for(KeyValue kv : tr.getRange(r, ReadTransaction.ROW_LIMIT_UNLIMITED, false, StreamingMode.SMALL)){
|
||||
if(haltingCondition(kv.getKey(), kv.getValue())){
|
||||
break;
|
||||
}
|
||||
System.out.println(Tuple.fromBytes(kv.getKey()).toString()
|
||||
+ " " + Tuple.fromBytes(kv.getValue()).toString());
|
||||
}
|
||||
|
||||
However, in some situations, you may want to explicitly control the number of key-value pairs returned. This may be the case if your data model creates blocks of N key-value pairs, and you want to read M blocks at a time and therefore a sub-range of N x M key-value pairs. You can use the limit parameter for this purpose.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*Parallel retrieval*
|
||||
|
||||
For very large range reads, you can use multiple clients to perform reads in parallel. In this case, you'll want to estimate sub-ranges of roughly equal size based on the distribution of your keys. The :ref:`locality <api-python-locality>` functions can be used to find the partition boundaries used by the database, which will be roughly uniformly distributed in bytes of data. The partition boundaries can then be used to derive boundaries between sub-ranges for parallel reading.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here’s a basic function that successively reads sub-ranges of a size determined by the value of ``LIMIT``.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
public static void getRangeLimited(TransactionContext tcx, final KeySelector begin, final KeySelector end){
|
||||
tcx.run(new Function<Transaction,Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
boolean keysToCheck = true;
|
||||
ArrayList<Tuple> keysFound = new ArrayList<Tuple>();
|
||||
KeySelector n_begin = new KeySelector(begin.getKey(),true,begin.getOffset());
|
||||
while(keysToCheck){
|
||||
keysToCheck = false;
|
||||
for(KeyValue kv : tr.getRange(n_begin, end, LIMIT)){
|
||||
keysToCheck = true;
|
||||
Tuple t = Tuple.fromBytes(kv.getKey());
|
||||
if(keysFound.size() == 0
|
||||
|| !t.equals(keysFound.get(keysFound.size()-1))){
|
||||
keysFound.add(t);
|
||||
}
|
||||
}
|
||||
if(keysToCheck){
|
||||
n_begin = KeySelector.firstGreaterThan(keysFound.get(keysFound.size()-1).pack());
|
||||
ArrayList<Object> readableFound = new ArrayList<Object>();
|
||||
for(Tuple t : keysFound){
|
||||
readableFound.add(t.get(1));
|
||||
}
|
||||
System.out.println(readableFound);
|
||||
keysFound = new ArrayList<Tuple>();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
#####################
|
||||
Segmented Range Reads
|
||||
#####################
|
||||
|
||||
**Python** :doc:`Java <segmented-range-reads-java>`
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Perform range reads in calibrated batches.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Retrieve data in batches whose size you select based on your data model or application.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
FoundationDB supports streaming modes that make range reads efficient even for large amounts of data. You can usually get good performance by selecting the proper streaming mode. However, there are particular cases in which you may want to exercise finer grained control of data retrieval. You can exercise this control using the limit parameter.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
This approach works with arbitrary ranges, which are, by definition, ordered. The goal here is to be able to walk through sub-ranges in order.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
A range read returns a container that issues asynchronous reads to the database. The client usually processes the data by iterating over the values returned by the container. The API balances latency and bandwidth by fetching data in batches as determined by the ``streaming_mode`` parameter. Streaming modes allow you to customize this balance based on how you intend to consume the data. The default streaming mode (iterator) is quite efficient. However, if you anticipate that your range read will retrieve a large amount of data, you should select a streaming mode to match your use case. For example, if you're iterating through a large range and testing against a condition that may result in early termination, you can use the ``small`` streaming mode::
|
||||
|
||||
for k, v in tr.get_range('a', 'c', streaming_mode=fdb.StreamingMode.small):
|
||||
if halting_condition(k, v): break
|
||||
print(k,v)
|
||||
|
||||
However, in some situations, you may want to explicitly control the number of key-value pairs returned. This may be the case if your data model creates blocks of N key-value pairs, and you want to read M blocks at a time and therefore a sub-range of N x M key-value pairs. You can use the limit parameter for this purpose.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*Parallel retrieval*
|
||||
|
||||
For very large range reads, you can use multiple clients to perform reads in parallel. In this case, you'll want to estimate sub-ranges of roughly equal size based on the distribution of your keys. The :ref:`locality <api-python-locality>` functions can be used to find the partition boundaries used by the database, which will be roughly uniformly distributed in bytes of data. The partition boundaries can then be used to derive boundaries between sub-ranges for parallel reading.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here’s a basic function that successively reads sub-ranges of a size determined by the value of ``LIMIT``.
|
||||
::
|
||||
|
||||
@fdb.transactional
|
||||
def get_range_limited(tr, begin, end):
|
||||
keys_found = True
|
||||
while keys_found:
|
||||
keys_found = []
|
||||
for k, v in tr.get_range(begin, end, limit=LIMIT):
|
||||
keys_found.append(k)
|
||||
if keys_found:
|
||||
begin = fdb.KeySelector.first_greater_than(keys_found[-1])
|
||||
yield keys_found
|
|
@ -0,0 +1,135 @@
|
|||
##############
|
||||
Simple Indexes
|
||||
##############
|
||||
|
||||
:doc:`Python <simple-indexes>` **Java**
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Add (one or more) indexes to allow efficient retrieval of data in multiple ways.
|
||||
|
||||
Challenges
|
||||
==========
|
||||
|
||||
There are two big challenges with building indexes in a key-value store: 1) Storing the index so that multiple data elements “matching” the index read are returned as efficiently as possible, and 2) Keeping the indexes in sync with the data with concurrent readers and writers.
|
||||
|
||||
Strategy
|
||||
========
|
||||
|
||||
By using the key ordering of FoundationDB, we can store indexes so that an index query can return multiple matches using a single efficient range read operation. By updating the data element and all of its associated indexes together within a single ACID transaction we can guarantee that the data and indexes stay in sync.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
Let's say the primary copy of the data is stored with key-value pairs where the key has a tuple-structure consisting of a subspace and an ID:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
// (main_subspace, ID) = value
|
||||
|
||||
This structure lets you lookup an “ID” easily and get its associated value. But, let’s say part of the value is a zipcode. You might be interested in all IDs that have a zipcode of 22182. You could answer that question, but it would require scanning every single ID. What we need to improve the efficiency is an “index on zipcode”.
|
||||
|
||||
An index is essentially another representation of the data, designed to be looked up in a different way:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
// (index_subspace, zipcode, ID) = ''
|
||||
|
||||
To make the index, you store both the zipcode and the ID as parts of the key, but don’t store the whole value again. You also put the index in its own subspace to keep it separate from the primary data.
|
||||
|
||||
Now, to answer the question of what IDs match zipcode 22182, you can now restrict the search to all tuples matching ``(index_subspace, 22182, *)``. Happily, because of the way that ordered tuples get packed into ordered keys, all of the tuples matching this pattern can be retrieved using a single range-read operation on the database. This makes index queries blazing fast--requiring one database operation instead of a scan of the entire dataset.
|
||||
|
||||
You can use the pattern above in any ordered key-value store. But, as anyone who has tried it will tell you, the trick is dealing with maintaining these indexes during concurrent reads and writes. In most distributed databases, this is a nightmare of race conditions and extra logic to deal with the fact that, while the data and the indexes both get updated, they do not necessarily do so at the same time.
|
||||
|
||||
By contrast, FoundationDB’s ACID transactions completely handle the difficult concurrency problem automatically. This is accomplished by simply updating the data and the indexes in the same transaction. A good approach is to implement a transactional setter function that does nothing but perform a logical write to both the data record and its indexes. This approach keeps your code clean and makes it easier to add further indexes in the future.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
Additional indexes
|
||||
------------------
|
||||
|
||||
Of course, you can maintain as many indexes as you need. You are trading off write performance (and a bit of capacity usage) to speed up read performance. In general, you usually add indexes to support all of the access patterns that you actually use. For example, if we need fast access by both the "X" and "Y" properties, we could maintain three data representations (the main data plus two indexes):
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
// (main_subspace, ID) = value
|
||||
// (index_x, X, ID) = ''
|
||||
// (index_y, Y, ID) = ''
|
||||
|
||||
Covering indexes
|
||||
----------------
|
||||
|
||||
In the above examples, the index gives you an entity ID or primary key with which the rest of the record can be retrieved. Sometimes you might want to retrieve the entire record from an index with a single read. In this case, you can store all data components in the key, possibly including the value.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
// (main_subspace, ID) = value
|
||||
// (index_subspace, X, ID) = value
|
||||
|
||||
The obvious tradeoff is that you are storing another entire copy of the value.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
In this example, we’re storing user data based on user ID but sometimes need to retrieve users based on their zipcode. We use a transactional function to set user data and its index and another to retrieve data using the index.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class MicroIndexes {
|
||||
|
||||
private static final FDB fdb;
|
||||
private static final Database db;
|
||||
private static final Subspace main;
|
||||
private static final Subspace index;
|
||||
|
||||
static {
|
||||
fdb = FDB.selectAPIVersion(510);
|
||||
db = fdb.open();
|
||||
main = new Subspace(Tuple.from("user"));
|
||||
index = new Subspace(Tuple.from("zipcode_index"));
|
||||
}
|
||||
|
||||
// TODO These three methods (setUser, getUser, and getUserIDsInRegion)
|
||||
// are all in the recipe book.
|
||||
public static void setUser(TransactionContext tcx, final String ID, final String name, final String zipcode){
|
||||
tcx.run(new Function<Transaction,Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
tr.set(main.pack(Tuple.from(ID,zipcode)), Tuple.from(name).pack());
|
||||
tr.set(index.pack(Tuple.from(zipcode,ID)), Tuple.from().pack());
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Normal lookup.
|
||||
public static String getUser(TransactionContext tcx, final String ID){
|
||||
return tcx.run(new Function<Transaction,String>() {
|
||||
public String apply(Transaction tr){
|
||||
for(KeyValue kv : tr.getRange(main.subspace(Tuple.from(ID)).range(), 1)){
|
||||
// Return user with correct ID (if exists).
|
||||
return Tuple.fromBytes(kv.getValue()).getString(0);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Index lookup.
|
||||
public static ArrayList<String> getUserIDsInRegion(TransactionContext tcx, final String zipcode){
|
||||
return tcx.run(new Function<Transaction,ArrayList<String>>() {
|
||||
public ArrayList<String> apply(Transaction tr){
|
||||
ArrayList<String> IDs = new ArrayList<String>();
|
||||
for(KeyValue kv : tr.getRange(index.subspace(Tuple.from(zipcode)).range())){
|
||||
IDs.add(index.unpack(kv.getKey()).getString(1));
|
||||
}
|
||||
return IDs;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
That's just about all you need to create an index.
|
|
@ -0,0 +1,90 @@
|
|||
##############
|
||||
Simple Indexes
|
||||
##############
|
||||
|
||||
**Python** :doc:`Java <simple-indexes-java>`
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Add (one or more) indexes to allow efficient retrieval of data in multiple ways.
|
||||
|
||||
Challenges
|
||||
==========
|
||||
|
||||
There are two big challenges with building indexes in a key-value store: 1) Storing the index so that multiple data elements “matching” the index read are returned as efficiently as possible, and 2) Keeping the indexes in sync with the data with concurrent readers and writers.
|
||||
|
||||
Strategy
|
||||
========
|
||||
|
||||
By using the key ordering of FoundationDB, we can store indexes so that an index query can return multiple matches using a single efficient range read operation. By updating the data element and all of its associated indexes together within a single ACID transaction we can guarantee that the data and indexes stay in sync.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
Let's say the primary copy of the data is stored with key-value pairs where the key has a tuple-structure consisting of a subspace and an ID::
|
||||
|
||||
(main_subspace, ID) = value
|
||||
|
||||
This structure lets you lookup an “ID” easily and get its associated value. But, let’s say part of the value is a zipcode. You might be interested in all IDs that have a zipcode of 22182. You could answer that question, but it would require scanning every single ID. What we need to improve the efficiency is an “index on zipcode”.
|
||||
|
||||
An index is essentially another representation of the data, designed to be looked up in a different way::
|
||||
|
||||
(index_subspace, zipcode, ID) = ''
|
||||
|
||||
To make the index, you store both the zipcode and the ID as parts of the key, but don’t store the whole value again. You also put the index in its own subspace to keep it separate from the primary data.
|
||||
|
||||
Now, to answer the question of what IDs match zipcode 22182, you can restrict the search to all tuples matching ``(index_subspace, 22182, *)``. Happily, because of the way that ordered tuples get packed into ordered keys, all of the tuples matching this pattern can be retrieved using a single range-read operation on the database. This makes index queries blazing fast--requiring one database operation instead of a scan of the entire dataset.
|
||||
|
||||
You can use the pattern above in any ordered key-value store. But, as anyone who has tried it will tell you, the trick is dealing with maintaining these indexes during concurrent reads and writes. In most distributed databases, this is a nightmare of race conditions and extra logic to deal with the fact that, while the data and the indexes both get updated, they do not necessarily do so at the same time.
|
||||
|
||||
By contrast, FoundationDB’s ACID transactions completely handle the difficult concurrency problem automatically. This is accomplished by simply updating the data and the indexes in the same transaction. A good approach is to implement a transactional setter function that does nothing but perform a logical write to both the data record and its indexes. This approach keeps your code clean and makes it easier to add further indexes in the future.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
Additional indexes
|
||||
------------------
|
||||
|
||||
Of course, you can maintain as many indexes as you need. You are trading off write performance (and a bit of capacity usage) to speed up read performance. In general, you usually add indexes to support all of the access patterns that you actually use. For example, if we need fast access by both the "X" and "Y" properties, we could maintain three data representations (the main data plus two indexes)::
|
||||
|
||||
(main_subspace, ID) = value
|
||||
(index_x, X, ID) = ''
|
||||
(index_y, Y, ID) = ''
|
||||
|
||||
Covering indexes
|
||||
----------------
|
||||
|
||||
In the above examples, the index gives you an entity ID or primary key with which the rest of the record can be retrieved. Sometimes you might want to retrieve the entire record from an index with a single read. In this case, you can store all data components in the key, possibly including the value.
|
||||
::
|
||||
|
||||
(main_subspace, ID) = value
|
||||
(index_subspace, X, ID) = value
|
||||
|
||||
The obvious tradeoff is that you are storing another entire copy of the value.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
In this example, we’re storing user data based on user_ID but sometimes need to retrieve users based on their zipcode. We use a transactional function to set user data and its index and another to retrieve data using the index.
|
||||
::
|
||||
|
||||
user = Subspace(('user',))
|
||||
index = Subspace(('zipcode_index',))
|
||||
|
||||
@fdb.transactional
|
||||
def set_user(tr, ID, name, zipcode):
|
||||
tr[user[ID][zipcode]] = name
|
||||
tr[index[zipcode][ID]] = ''
|
||||
|
||||
# Normal lookup
|
||||
@fdb.transactional
|
||||
def get_user(tr, ID):
|
||||
return tr[user[ID]]
|
||||
|
||||
# Index lookup
|
||||
@fdb.transactional
|
||||
def get_user_IDs_in_region(tr, region):
|
||||
return [index.unpack(k)[1] for k, _ in tr[index[region].range()]]
|
||||
|
||||
That's just about all you need to create an index.
|
|
@ -0,0 +1,87 @@
|
|||
################
|
||||
Spatial Indexing
|
||||
################
|
||||
|
||||
:doc:`Python <spatial-indexing>` **Java**
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Create a spatial index for the database.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
For a data set of labeled points in two-dimensional (2D) space, support the efficient retrieval of all points within an axis-aligned rectangular region.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
To achieve good performance, you encode each point as a binary key whose structure allows queries to efficiently find points in a specified region. The encoding uses a `Z-order curve <http://en.wikipedia.org/wiki/Z-order_curve>`_ that maps 2D data to one-dimensional (1D) keys suitable for an ordered key-value store.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
FoundationDB stores binary keys in their natural order. Z-order curves map 2D points to binary keys in a manner that preserves proximity, in the sense that nearby points end up in nearby binary keys. This property allows range-based queries to be `efficiently computed from the keys <http://en.wikipedia.org/wiki/Z-order_curve#Use_with_one-dimensional_data_structures_for_range_searching>`_.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
The indexer conceptually traverses the 2D space in a systematic order, tracing out a 1D curve. Data points are mapped to their position along the curve as they’re encountered. Each step in the traversal is encoded as a bit indicating its direction.
|
||||
|
||||
The spatial index exploits the proximity preservation of Z-order curves to store spatial data in the ordered key-value store and support spatial queries.
|
||||
|
||||
Given a point p represented as pair of coordinates (x, y), the Z-order curve lets us encode p as a binary key z. In other words, you have a pair of functions:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
public long xyToZ(long[] p){
|
||||
long x,y,z;
|
||||
x = p[0]; y = p[1];
|
||||
// Encode (x,y) as a binary key z.
|
||||
return z;
|
||||
}
|
||||
|
||||
public long[] zToXy(long z){
|
||||
long[] p = new long[2];
|
||||
long x, y;
|
||||
// Decode z to a pair of coordinates (x,y).
|
||||
p[0] = x; p[1] = y;
|
||||
return p;
|
||||
}
|
||||
|
||||
The spatial index will use a pair of subspaces: one, ``zLabel``, to give us efficient access to labels by point; the other, ``labelZ``, to give us efficient access to points by label. Storing two access paths for each item is an example of an inverse index (see the pattern for Simple Indexes). You set both parts of the index in a single transactional function, as follows:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
public void setLocation(TransactionContext tcx, final String label, final long[] pos){
|
||||
tcx.run(new Function<Transaction,Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
long z = xyToZ(pos);
|
||||
long previous;
|
||||
// Read labelZ.subspace(Tuple.from(label)) to find previous z.
|
||||
if(/* there is a previous z */){
|
||||
tr.clear(labelZ.pack(Tuple.from(label,previous)));
|
||||
tr.clear(zLabel.pack(Tuple.from(previous,label)));
|
||||
}
|
||||
tr.set(labelZ.pack(Tuple.from(label,z)),Tuple.from().pack());
|
||||
tr.set(zLabel.pack(Tuple.from(z,label)),Tuple.from().pack());
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
This representation gives the building blocks you need to efficiently find all the points in a given rectangle.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
Higher dimensions
|
||||
-----------------
|
||||
|
||||
Z-order curves can be straightforwardly applied to points in three or more dimensions. As in two dimensions, each point will be mapped to a binary key determined by its position along the curve.
|
||||
|
||||
Richer non-spatial data
|
||||
-----------------------
|
||||
|
||||
We've assumed that the labels of our data items are strings or a similar primitive data type, but you can easily extend the technique to richer data records in which the spatial coordinates are one component among several.
|
|
@ -0,0 +1,70 @@
|
|||
################
|
||||
Spatial Indexing
|
||||
################
|
||||
|
||||
**Python** :doc:`Java <spatial-indexing-java>`
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Create a spatial index for the database.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
For a data set of labeled points in two-dimensional (2D) space, support the efficient retrieval of all points within an axis-aligned rectangular region.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
To achieve good performance, you encode each point as a binary key whose structure allows queries to efficiently find points in a specified region. The encoding uses a `Z-order curve <http://en.wikipedia.org/wiki/Z-order_curve>`_ that maps 2D data to one-dimensional (1D) keys suitable for an ordered key-value store.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
FoundationDB stores binary keys in their natural order. Z-order curves map 2D points to binary keys in a manner that preserves proximity, in the sense that nearby points end up in nearby binary keys. This property allows range-based queries to be `efficiently computed from the keys <http://en.wikipedia.org/wiki/Z-order_curve#Use_with_one-dimensional_data_structures_for_range_searching>`_.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
The indexer conceptually traverses the 2D space in a systematic order, tracing out a 1D curve. Data points are mapped to their position along the curve as they’re encountered. Each step in the traversal is encoded as a bit indicating its direction.
|
||||
|
||||
The spatial index exploits the proximity preservation of Z-order curves to store spatial data in the ordered key-value store and support spatial queries.
|
||||
|
||||
Given a point p represented as pair of coordinates (x, y), the Z-order curve lets us encode p as a binary key z. In other words, you have a pair of functions::
|
||||
|
||||
def xy_to_z(p):
|
||||
(x, y) = p
|
||||
encode (x, y) as a binary key z
|
||||
return z
|
||||
|
||||
def z_to_xy(z):
|
||||
decode z to a pair of coordinates (x, y)
|
||||
return (x, y)
|
||||
|
||||
The spatial index will use a pair of subspaces: one, ``z_label``, to give us efficient access to labels by point; the other, ``label_z``, to give us efficient access to points by label. Storing two access paths for each item is an example of an inverse index (see the pattern for Simple Indexes). You set both parts of the index in a single transactional function, as follows::
|
||||
|
||||
@fdb.transactional
|
||||
def set_location(tr, label, pos):
|
||||
z = xy_to_z(pos)
|
||||
read label_z[label] to find previous z
|
||||
if there is a previous z:
|
||||
delete label_z[label][previous]
|
||||
delete z_label[previous][label]
|
||||
write label_z[label][z]
|
||||
write z_label[z][label]
|
||||
|
||||
This representation gives the building blocks you need to efficiently find all the points in a given rectangle.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
Higher dimensions
|
||||
-----------------
|
||||
|
||||
Z-order curves can be straightforwardly applied to points in three or more dimensions. As in two dimensions, each point will be mapped to a binary key determined by its position along the curve.
|
||||
|
||||
Richer non-spatial data
|
||||
-----------------------
|
||||
|
||||
We've assumed that the labels of our data items are strings or a similar primitive data type, but you can easily extend the technique to richer data records in which the spatial coordinates are one component among several.
|
|
@ -0,0 +1,102 @@
|
|||
####################
|
||||
Subspace Indirection
|
||||
####################
|
||||
|
||||
:doc:`Python <subspace-indirection>` **Java**
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Employ subspace indirection to manage bulk inserts or similar long-running operations.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Some large or long operations, such as bulk inserts, cannot be handled in a single FoundationDB transaction because the database does not support transactions over five seconds.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
We can handle long-running operations using a distinct subspace to hold a temporary copy of the data. The new subspace is transactionally moved into place upon completion of the writes. We can perform the move quickly because the client accesses the subspaces through managed references.
|
||||
|
||||
FoundationDB :ref:`directories <developer-guide-directories>` provide a convenient method to indirectly reference subspaces. Each directory is identified by a path that is mapped to a prefix for a subspace. The indirection from paths to subspaces makes it fast to move directories by renaming their paths.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
The ordering of keys applies within each directory subspace for whatever data model is used by the application. However, directory subspaces are independent of each other, and there is no meaningful ordering between them.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
For a single client, we can use a simple Workspace class to handle creation of a new subspace and transactionally swapping it into place. Rather than working with a subspace directly, a client accesses the current subspace through a managed reference as follows:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
Database db = fdb.open();
|
||||
Future<DirectorySubspace> workingDir = DirectoryLayer.getDefault().createOrOpen(db, PathUtil.from("working"));
|
||||
Workspace workspace = new Workspace(workingDir.get(), db);
|
||||
final DirectorySubspace current = workspace.getCurrent().get();
|
||||
|
||||
The client performs transactions on data in the subspace current in the usual manner. When we want a new workspace for a bulk load or other long-running operation, we create one with a workspace:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
final DirectorySubspace newspace = workspace.getNew().get();
|
||||
try {
|
||||
clearSubspace(db, newspace);
|
||||
db.run(new Function<Transaction,Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
tr.set(newspace.pack(Tuple.from(3)),Tuple.from("c").pack());
|
||||
tr.set(newspace.pack(Tuple.from(4)), Tuple.from("d").pack());
|
||||
return null;
|
||||
}
|
||||
});
|
||||
} finally {
|
||||
// Asynchronous operation--wait until result is reached.
|
||||
workspace.replaceWithNew().blockUntilReady();
|
||||
}
|
||||
|
||||
When the workspace completes, it transactionally replaces the current subspace with the new one.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*Multiple Clients*
|
||||
|
||||
Beyond the ability to load and transactionally swap in a single new data set, an application may want to support multiple clients concurrently performing long-running operations on a data set. In this case, the application could perform optimistic validation of an operation before accepting it.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here's a simple Workspace class for swapping in a new workspace supporting the basic usage above.
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
/**
 * Manages a pair of directory subspaces under a parent directory: a
 * "current" subspace that clients read from, and a "new" subspace used
 * to stage a bulk load. replaceWithNew() swaps the staged data into
 * place in a single transaction.
 */
public static class Workspace {
    // Database handle used for all directory operations.
    private final Database db;
    // Parent directory under which "current" and "new" live.
    private final DirectorySubspace dir;

    public Workspace(DirectorySubspace directory, Database db){
        this.dir = directory;
        this.db = db;
    }

    // Returns (creating on first use) the subspace clients should use now.
    public Future<DirectorySubspace> getCurrent() {
        return dir.createOrOpen(this.db, PathUtil.from("current"));
    }

    // Returns (creating on first use) the staging subspace for new data.
    public Future<DirectorySubspace> getNew() {
        return dir.createOrOpen(this.db, PathUtil.from("new"));
    }

    // Transactionally replaces "current" with "new": the remove and the
    // move run in ONE transaction (runAsync), so readers either see the
    // old data or the fully swapped-in new data, never a missing directory.
    public Future<DirectorySubspace> replaceWithNew() {
        return this.db.runAsync(new Function<Transaction,Future<DirectorySubspace>>() {
            public Future<DirectorySubspace> apply(final Transaction tr){
                return dir.remove(tr, PathUtil.from("current")) // Clear the old current.
                    .flatMap(new Function<Void,Future<DirectorySubspace>>() {
                        public Future<DirectorySubspace> apply(Void arg0) {
                            // Replace the old directory with the new one.
                            // Runs only after the remove completes, within
                            // the same transaction tr.
                            return dir.move(tr, PathUtil.from("new"), PathUtil.from("current"));
                        }
                    });
            }
        });
    }
}
|
|
@ -0,0 +1,84 @@
|
|||
####################
|
||||
Subspace Indirection
|
||||
####################
|
||||
|
||||
**Python** :doc:`Java <subspace-indirection-java>`
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Employ subspace indirection to manage bulk inserts or similar long-running operations.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Some large or long operations, such as bulk inserts, cannot be handled in a single FoundationDB transaction because the database does not support transactions over five seconds.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
We can handle long-running operations using a distinct subspace to hold a temporary copy of the data. The new subspace is transactionally moved into place upon completion of the writes. We can perform the move quickly because the client accesses the subspaces through managed references.
|
||||
|
||||
FoundationDB :ref:`directories <developer-guide-directories>` provide a convenient method to indirectly reference subspaces. Each directory is identified by a path that is mapped to a prefix for a subspace. The indirection from paths to subspaces makes it fast to move directories by renaming their paths.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
The ordering of keys applies within each directory subspace for whatever data model is used by the application. However, directory subspaces are independent of each other, and there is no meaningful ordering between them.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
For a single client, we can use a simple context manager to handle creation of a new subspace and transactionally swapping it into place. Rather than working with a subspace directly, a client accesses the current subspace through a managed reference as follows::
|
||||
|
||||
db = fdb.open()
|
||||
|
||||
working_dir = fdb.directory.create_or_open(db, (u'working',))
|
||||
workspace = Workspace(working_dir, db)
|
||||
current = workspace.current
|
||||
|
||||
The client performs transactions on data in the subspace current in the usual manner. When we want a new workspace for a bulk load or other long-running operation, we create one with a context manager::
|
||||
|
||||
with workspace as newspace:
|
||||
# . . .
|
||||
# perform long-running operation using newspace here
|
||||
# . . .
|
||||
# current workspace has now been replaced by the new one
|
||||
current = workspace.current
|
||||
|
||||
When the context manager completes, it transactionally replaces the current workspace with the new one.
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*Multiple Clients*
|
||||
|
||||
Beyond the ability to load and transactionally swap in a single new data set, an application may want to support multiple clients concurrently performing long-running operations on a data set. In this case, the application could perform optimistic validation of an operation before accepting it.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here's a simple context manager for swapping in a new workspace for the basic usage above.
|
||||
|
||||
::
|
||||
|
||||
class Workspace(object):
    """Manage a 'current' directory subspace plus a 'new' staging subspace.

    Used as a context manager: the ``with`` body writes into the new
    subspace, and on exit the new subspace transactionally replaces the
    current one, so readers switch atomically to the staged data.
    """

    def __init__(self, directory, db):
        # directory: parent directory under which 'current' and 'new' live.
        self.dir = directory
        self.db = db

    def __enter__(self):
        # Hand the caller a fresh staging subspace to populate.
        return self.dir.create_or_open(self.db, (u'new',))

    def __exit__(self, *exc):
        # Swap the staged subspace into place. NOTE(review): exc is not
        # inspected, so the swap happens even if the with-body raised.
        self._update(self.db)

    @fdb.transactional
    def _update(self, tr):
        # Directory paths are tuples of unicode strings. The original code
        # passed (u'current') / (u'new') -- parenthesized *strings*, since a
        # one-element tuple needs a trailing comma -- which would be
        # misinterpreted as a path. Use proper one-element tuples, matching
        # __enter__ and the `current` property below.
        self.dir.remove(tr, (u'current',))
        self.dir.move(tr, (u'new',), (u'current',))

    @property
    def current(self):
        # Subspace clients should read from right now.
        return self.dir.create_or_open(self.db, (u'current',))
|
|
@ -0,0 +1,168 @@
|
|||
######
|
||||
Tables
|
||||
######
|
||||
|
||||
:doc:`Python <tables>` **Java**
|
||||
|
||||
Goal
|
||||
====
|
||||
|
||||
Create a table data structure suitable for sparse data.
|
||||
|
||||
Challenge
|
||||
=========
|
||||
|
||||
Support efficient random access to individual cells in a table, as well as retrieval of all cells in a particular row or all cells in a particular column.
|
||||
|
||||
Explanation
|
||||
===========
|
||||
|
||||
Tables give us a representation for two-dimensional data with labeled rows and columns. (Column labels are common in data sets. For rows, a primary key, such as an entity ID, can be used.) Each cell in the table will be modeled using two key-value pairs, one in row-dominant order and one in column-dominant order.
|
||||
|
||||
Ordering
|
||||
========
|
||||
|
||||
By storing the table in both row order and column order, we can support efficient retrieval of entire rows or columns with a single range read.
|
||||
|
||||
Pattern
|
||||
=======
|
||||
|
||||
We construct a key from a tuple containing the row and column identifiers. Unassigned cells in the tables will consume no storage, so sparse tables are stored very efficiently. As a result, a table can safely have a very large number of columns.
|
||||
|
||||
Using the lexicographic order of tuples, we can store the data in a row-oriented or column-oriented manner by placing either the row or column first in the tuple, respectively. Placing the row first makes it efficient to read all the cells in a particular row with a single range read; placing the column first makes reading a column efficient. We can support both access patterns by storing cells in both row-oriented and column-oriented layouts, allowing efficient retrieval of either an entire row or an entire column.
|
||||
|
||||
We can create a subspace for the table and nested subspaces for the row and column indexes. Setting a cell would then look like:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
tr.set(rowIndex.subspace(Tuple.from(row, column)).getKey(), pack(value));
|
||||
tr.set(colIndex.subspace(Tuple.from(column, row)).getKey(), pack(value));
|
||||
|
||||
Extensions
|
||||
==========
|
||||
|
||||
*Higher dimensions*
|
||||
|
||||
This approach can be straightforwardly extended to N dimensions for N > 2. Unless N is small and your data is very sparse, you probably won't want to store all N! index orders, as that could consume a prohibitive amount of space. Instead, you'll want to select the most common access patterns for direct storage.
|
||||
|
||||
Code
|
||||
====
|
||||
|
||||
Here’s a simple implementation of the basic table pattern:
|
||||
|
||||
.. code-block:: java
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
public class MicroTable {
|
||||
private static final FDB fdb;
|
||||
private static final Database db;
|
||||
private static final Subspace table;
|
||||
private static final Subspace rowIndex;
|
||||
private static final Subspace colIndex;
|
||||
|
||||
static {
|
||||
fdb = FDB.selectAPIVersion(510);
|
||||
db = fdb.open();
|
||||
table = new Subspace(Tuple.from("T"));
|
||||
rowIndex = table.subspace(Tuple.from("R"));
|
||||
colIndex = table.subspace(Tuple.from("C"));
|
||||
}
|
||||
|
||||
// Packing and unpacking helper functions.
|
||||
private static byte[] pack(Object value){
|
||||
return Tuple.from(value).pack();
|
||||
}
|
||||
|
||||
private static Object unpack(byte[] value){
|
||||
return Tuple.fromBytes(value).get(0);
|
||||
}
|
||||
|
||||
public static void setCell(TransactionContext tcx, final String row,
|
||||
final String column, final Object value){
|
||||
tcx.run(new Function<Transaction, Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
tr.set(rowIndex.subspace(Tuple.from(row, column)).getKey(),
|
||||
pack(value));
|
||||
tr.set(colIndex.subspace(Tuple.from(column,row)).getKey(),
|
||||
pack(value));
|
||||
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static Object getCell(TransactionContext tcx, final String row,
|
||||
final String column){
|
||||
return tcx.run(new Function<Transaction, Object>() {
|
||||
public Object apply(Transaction tr){
|
||||
return unpack(tr.get(rowIndex.subspace(
|
||||
Tuple.from(row,column)).getKey()).get());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static void setRow(TransactionContext tcx, final String row,
|
||||
final Map<String,Object> cols){
|
||||
tcx.run(new Function<Transaction, Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
tr.clear(rowIndex.subspace(Tuple.from(row)).range());
|
||||
|
||||
for(Map.Entry<String,Object> cv : cols.entrySet()){
|
||||
setCell(tr, row, cv.getKey(), cv.getValue());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static void setColumn(TransactionContext tcx, final String column,
|
||||
final Map<String,Object> rows){
|
||||
tcx.run(new Function<Transaction,Void>() {
|
||||
public Void apply(Transaction tr){
|
||||
tr.clear(colIndex.subspace(Tuple.from(column)).range());
|
||||
for(Map.Entry<String,Object> rv : rows.entrySet()){
|
||||
setCell(tr, rv.getKey(), column, rv.getValue());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public static TreeMap<String,Object> getRow(TransactionContext tcx,
|
||||
final String row){
|
||||
return tcx.run(new Function<Transaction,TreeMap<String,Object> >() {
|
||||
public TreeMap<String,Object> apply(Transaction tr){
|
||||
TreeMap<String,Object> cols = new TreeMap<String,Object>();
|
||||
|
||||
for(KeyValue kv : tr.getRange(
|
||||
rowIndex.subspace(Tuple.from(row)).range())){
|
||||
cols.put(rowIndex.unpack(kv.getKey()).getString(1),
|
||||
unpack(kv.getValue()));
|
||||
}
|
||||
|
||||
return cols;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
public static TreeMap<String,Object> getColumn(TransactionContext tcx,
|
||||
final String column){
|
||||
return tcx.run(new Function<Transaction,TreeMap<String,Object> >() {
|
||||
public TreeMap<String,Object> apply(Transaction tr){
|
||||
TreeMap<String,Object> rows = new TreeMap<String,Object>();
|
||||
|
||||
for(KeyValue kv : tr.getRange(
|
||||
colIndex.subspace(Tuple.from(column)).range())){
|
||||
rows.put(colIndex.unpack(kv.getKey()).getString(1),
|
||||
unpack(kv.getValue()));
|
||||
}
|
||||
|
||||
return rows;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
That’s about all you need to store and retrieve data from simple tables.
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue