Skip to content

Commit

Permalink
feat: storage backup (#417)
Browse files Browse the repository at this point in the history
Keep uploaded content backups for catastrophe recovery.
  • Loading branch information
vasco-santos authored Sep 27, 2021
1 parent ed8f089 commit ae5423a
Show file tree
Hide file tree
Showing 14 changed files with 19,296 additions and 14,027 deletions.
33,091 changes: 19,100 additions & 13,991 deletions package-lock.json

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions packages/api/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ One time set up of your cloudflare worker subdomain for dev:
wrangler secret put FAUNA_KEY --env $(whoami) # Get from fauna.com after creating a dev Classic DB
wrangler secret put CLUSTER_BASIC_AUTH_TOKEN --env $(whoami) # Get from web3.storage vault in 1password (not required for dev)
wrangler secret put SENTRY_DSN --env $(whoami) # Get from Sentry (not required for dev)
wrangler secret put S3_BUCKET_REGION --env $(whoami) # e.g. us-east-2 (not required for dev)
wrangler secret put S3_ACCESS_KEY_ID --env $(whoami) # Get from Amazon S3 (not required for dev)
wrangler secret put S3_SECRET_ACCESS_KEY_ID --env $(whoami) # Get from Amazon S3 (not required for dev)
wrangler secret put S3_BUCKET_NAME --env $(whoami) # e.g. web3.storage-staging-us-east-2 (not required for dev)
```
- `npm run publish` - Publish the worker under your env. An alias for `wrangler publish --env $(whoami)`
Expand Down Expand Up @@ -181,3 +185,7 @@ SENTRY_UPLOAD=false # toggle for sentry source/sourcemaps upload (capture will s
```
Production vars should be set in Github Actions secrets.
## S3 Setup
We use [S3](https://aws.amazon.com/s3/) for backup and disaster recovery. For production, an AWS account needs to be created.
12 changes: 7 additions & 5 deletions packages/api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
"dev": "wrangler dev --env $(whoami)",
"publish": "wrangler publish --env $(whoami)",
"build": "WEBPACK_CLI_FORCE_LOAD_ESM_CONFIG=true webpack",
"test": "npm-run-all -p -r mock:cluster mock:db test:e2e -s test:size",
"test": "npm-run-all -p -r mock:cluster mock:db mock:backup test:e2e -s test:size",
"test:size": "bundlesize",
"test:e2e": "playwright-test \"test/**/*.spec.js\" --sw src/index.js -b webkit",
"mock:cluster": "smoke -p 9094 test/mocks/cluster",
"mock:db": "smoke -p 9086 test/mocks/db"
"mock:db": "smoke -p 9086 test/mocks/db",
"mock:backup": "smoke -p 9096 test/mocks/backup"
},
"devDependencies": {
"@sentry/webpack-plugin": "^1.16.0",
Expand All @@ -37,6 +38,7 @@
"webpack-cli": "^4.7.2"
},
"dependencies": {
"@aws-sdk/client-s3": "^3.28.0",
"@ipld/car": "^3.1.4",
"@ipld/dag-cbor": "^6.0.3",
"@ipld/dag-pb": "^2.0.2",
Expand All @@ -48,13 +50,13 @@
"ipfs-car": "^0.5.8",
"itty-router": "^2.3.10",
"multiformats": "^9.0.4",
"p-retry": "^4.6.1"
"p-retry": "^4.6.1",
"uint8arrays": "^3.0.0"
},
"bundlesize": [
{
"path": "./dist/main.js",
"maxSize": "1 MB",
"compression": "none"
"maxSize": "1 MB"
}
]
}
88 changes: 69 additions & 19 deletions packages/api/src/car.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
/* eslint-env serviceworker */
import { gql } from '@web3-storage/db'
import { PutObjectCommand } from '@aws-sdk/client-s3'
import { CarBlockIterator } from '@ipld/car'
import { toString } from 'uint8arrays'
import { Block } from 'multiformats/block'
import { sha256 } from 'multiformats/hashes/sha2'
import * as raw from 'multiformats/codecs/raw'
import * as cbor from '@ipld/dag-cbor'
import * as pb from '@ipld/dag-pb'
Expand All @@ -10,6 +13,10 @@ import { GATEWAY, LOCAL_ADD_THRESHOLD, MAX_BLOCK_SIZE } from './constants.js'
import { JSONResponse } from './utils/json-response.js'
import { toPinStatusEnum } from './utils/pin.js'

/**
* @typedef {import('multiformats/cid').CID} CID
*/

const decoders = [pb, raw, cbor]

const CREATE_UPLOAD = gql`
Expand Down Expand Up @@ -143,24 +150,12 @@ export async function handleCarUpload (request, env, ctx, car, uploadType = 'Car

// Throws if CAR is invalid by our standards.
// Returns either the sum of the block sizes in the CAR, or the cumulative size of the DAG for a dag-pb root.
const { size: dagSize } = await carStat(car)
const { size: dagSize, rootCid } = await carStat(car)

// Note: We can't make use of `bytes` or `size` properties on the response from cluster.add
// `bytes` is the sum of block sizes in bytes. Where the CAR is a partial, it'll only be a shard of the total dag size.
// `size` is UnixFS FileSize which is 0 for directories, and is not set for raw encoded files, only dag-pb ones.
const { cid/*, bytes, size */ } = await env.cluster.add(car, {
metadata: { size: car.size.toString() },
// When >2.5MB, use local add, because waiting for blocks to be sent to
// other cluster nodes can take a long time. Replication to other nodes
// will be done async by bitswap instead.
local: car.size > LOCAL_ADD_THRESHOLD
})

const { peerMap } = await env.cluster.status(cid)
const pins = toPins(peerMap)
if (!pins.length) { // should not happen
throw new Error('not pinning on any node')
}
const [{ cid, pins }, backupKey] = await Promise.all([
addToCluster(car, env),
backup(car, rootCid, user._id, env)
])

let name = headers.get('x-name')
if (!name || typeof name !== 'string') {
Expand All @@ -178,6 +173,9 @@ export async function handleCarUpload (request, env, ctx, car, uploadType = 'Car
cid,
name,
type: uploadType,
backupUrls: backupKey
? [`https://${env.s3BucketName}.s3.${env.s3BucketRegion}.amazonaws.com/${backupKey}`]
: [],
pins,
dagSize
}
Expand Down Expand Up @@ -256,6 +254,58 @@ export async function sizeOf (response) {
return size
}

/**
* Adds car to local cluster and returns its content identifier and pins
*
* @param {Blob} car
* @param {import('./env').Env} env
*/
async function addToCluster (car, env) {
// Note: We can't make use of `bytes` or `size` properties on the response from cluster.add
// `bytes` is the sum of block sizes in bytes. Where the CAR is a partial, it'll only be a shard of the total dag size.
// `size` is UnixFS FileSize which is 0 for directories, and is not set for raw encoded files, only dag-pb ones.
const { cid } = await env.cluster.add(car, {
metadata: { size: car.size.toString() },
// When >2.5MB, use local add, because waiting for blocks to be sent to
// other cluster nodes can take a long time. Replication to other nodes
// will be done async by bitswap instead.
local: car.size > LOCAL_ADD_THRESHOLD
})

const { peerMap } = await env.cluster.status(cid)
const pins = toPins(peerMap)
if (!pins.length) { // should not happen
throw new Error('not pinning on any node')
}

return { cid, pins }
}

/**
* Backup given Car file keyed by /raw/${rootCid}/${userId}/${carHash}.car
* @param {Blob} blob
* @param {CID} rootCid
* @param {string} userId
* @param {import('../env').Env} env
*/
async function backup (blob, rootCid, userId, env) {
if (!env.s3Client) {
return undefined
}

const data = await blob.arrayBuffer()
const dataHash = await sha256.digest(new Uint8Array(data))
const keyStr = `raw/${rootCid.toString()}/${userId}/${toString(dataHash.bytes, 'base32')}.car`
const cmdParams = {
Bucket: env.s3BucketName,
Key: keyStr,
Body: blob
}

await env.s3Client.send(new PutObjectCommand(cmdParams))
return keyStr
}

/**
* Returns the sum of all block sizes and total blocks. Throws if the CAR does
* not conform to our idea of a valid CAR i.e.
Expand All @@ -266,7 +316,7 @@ export async function sizeOf (response) {
* - Missing root block
* - Missing non-root blocks (when root block has links)
*
* @typedef {{ size: number, blocks: number }} CarStat
* @typedef {{ size: number, blocks: number, rootCid: CID }} CarStat
* @param {Blob} carBlob
* @returns {Promise<CarStat>}
*/
Expand Down Expand Up @@ -314,7 +364,7 @@ async function carStat (carBlob) {
size = cumulativeSize(rootBlock.bytes, rootBlock.value)
}
}
return { size, blocks }
return { size, blocks, rootCid }
}

/**
Expand Down
34 changes: 32 additions & 2 deletions packages/api/src/env.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
/* global MAGIC_SECRET_KEY FAUNA_ENDPOINT FAUNA_KEY SALT CLUSTER_BASIC_AUTH_TOKEN CLUSTER_API_URL SENTRY_DSN, VERSION DANGEROUSLY_BYPASS_MAGIC_AUTH */
/* global MAGIC_SECRET_KEY FAUNA_ENDPOINT FAUNA_KEY SALT CLUSTER_BASIC_AUTH_TOKEN CLUSTER_API_URL SENTRY_DSN, VERSION DANGEROUSLY_BYPASS_MAGIC_AUTH S3_BUCKET_ENDPOINT S3_BUCKET_NAME S3_BUCKET_REGION S3_ACCESS_KEY_ID S3_SECRET_ACCESS_KEY_ID */
import Toucan from 'toucan-js'
import { S3Client } from '@aws-sdk/client-s3'
import { Magic } from '@magic-sdk/admin'
import { DBClient } from '@web3-storage/db'
import { Cluster } from '@nftstorage/ipfs-cluster'

import pkg from '../package.json'

/** @typedef {{ magic: Magic, db: DBClient, SALT: string }} Env */
/**
* @typedef {object} Env
* @property {Cluster} cluster
* @property {Magic} magic
* @property {DBClient} db
* @property {string} SALT
* @property {S3Client} [s3Client]
* @property {string} [s3BucketName]
* @property {string} [s3BucketRegion]
*/

/**
* @param {Request} req
Expand Down Expand Up @@ -49,4 +59,24 @@ export function envAll (_, env, event) {
const clusterAuthToken = env.CLUSTER_BASIC_AUTH_TOKEN || (typeof CLUSTER_BASIC_AUTH_TOKEN === 'undefined' ? undefined : CLUSTER_BASIC_AUTH_TOKEN)
const headers = clusterAuthToken ? { Authorization: `Basic ${clusterAuthToken}` } : {}
env.cluster = new Cluster(env.CLUSTER_API_URL || CLUSTER_API_URL, { headers })

// backups not required in dev mode
if (env.ENV === 'dev' && !(env.S3_ACCESS_KEY_ID || typeof S3_ACCESS_KEY_ID !== 'undefined')) {
console.log('running without backups wired up')
} else {
const s3Endpoint = env.S3_BUCKET_ENDPOINT || (typeof S3_BUCKET_ENDPOINT === 'undefined' ? undefined : S3_BUCKET_ENDPOINT)

env.s3BucketName = env.S3_BUCKET_NAME || S3_BUCKET_NAME
env.s3BucketRegion = env.S3_BUCKET_REGION || S3_BUCKET_REGION

env.s3Client = new S3Client({
endpoint: s3Endpoint,
forcePathStyle: !!s3Endpoint, // Force path if endpoint provided
region: env.S3_BUCKET_REGION || S3_BUCKET_REGION,
credentials: {
accessKeyId: env.S3_ACCESS_KEY_ID || S3_ACCESS_KEY_ID,
secretAccessKey: env.S3_SECRET_ACCESS_KEY_ID || S3_SECRET_ACCESS_KEY_ID
}
})
}
}
1 change: 1 addition & 0 deletions packages/api/test/car.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { CID } from 'multiformats/cid'
import { sha256 } from 'multiformats/hashes/sha2'
import * as pb from '@ipld/dag-pb'
import { CarWriter } from '@ipld/car'

import { endpoint } from './scripts/constants.js'
import * as JWT from '../src/utils/jwt.js'
import { SALT } from './scripts/worker-globals.js'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/**
* https://github.com/sinedied/smoke#javascript-mocks
*/
module.exports = () => {
return {
statusCode: 200,
headers: { 'Content-Type': 'application/json' }
}
}
5 changes: 5 additions & 0 deletions packages/api/test/scripts/worker-globals.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ export const FAUNA_KEY = 'test-fauna-key'
export const MAGIC_SECRET_KEY = 'test-magic-secret-key'
export const CLUSTER_API_URL = 'http://localhost:9094'
export const CLUSTER_BASIC_AUTH_TOKEN = 'test'
export const S3_BUCKET_ENDPOINT = 'http://localhost:9096'
export const S3_BUCKET_NAME = 'bucket'
export const S3_BUCKET_REGION = 'eu-central-1'
export const S3_ACCESS_KEY_ID = 'access-key-id'
export const S3_SECRET_ACCESS_KEY_ID = 'secret-access-key'

// Can be removed once we get a test mode for admin magic sdk.
export const DANGEROUSLY_BYPASS_MAGIC_AUTH = true
3 changes: 2 additions & 1 deletion packages/api/webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export default {
}
},
optimization: {
minimize: true
minimize: true,
usedExports: true
}
}
18 changes: 9 additions & 9 deletions packages/api/wrangler.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,44 +25,44 @@ format = "service-worker"
account_id = "fffa4b4363a7e5250af8357087263b3a" # Protocol Labs CF account
zone_id = "7eee3323c1b35b6650568604c65f441e" # web3.storage zone
route = "https://api.web3.storage/*"
vars = { CLUSTER_API_URL = "https://web3.storage.ipfscluster.io/api/" }
vars = { CLUSTER_API_URL = "https://web3.storage.ipfscluster.io/api/", ENV = "production" }

[env.staging]
# name = "web3-storage-staging"
account_id = "fffa4b4363a7e5250af8357087263b3a" # Protocol Labs CF account
zone_id = "7eee3323c1b35b6650568604c65f441e" # web3.storage zone
route = "https://api-staging.web3.storage/*"
vars = { CLUSTER_API_URL = "https://web3.storage.ipfscluster.io/api/" }
vars = { CLUSTER_API_URL = "https://web3.storage.ipfscluster.io/api/", ENV = "staging" }

[env.alan]
workers_dev = true
account_id = "4fe12d085474d33bdcfd8e9bed4d8f95"
vars = { CLUSTER_API_URL = "https://alan-cluster-api-web3-storage.loca.lt" }
vars = { CLUSTER_API_URL = "https://alan-cluster-api-web3-storage.loca.lt", ENV = "dev" }

[env.oli]
workers_dev = true
account_id = "6e5a2aed80cd37d77e8d0c797a75ebbd"
vars = { CLUSTER_API_URL = "https://oli-cluster-api-web3-storage.loca.lt" }
vars = { CLUSTER_API_URL = "https://oli-cluster-api-web3-storage.loca.lt", ENV = "dev" }

[env.yusef]
workers_dev = true
account_id = "8c3da25233263bd7a26c0e2e04569ded"
vars = { CLUSTER_API_URL = "https://yusef-cluster-api-web3-storage.loca.lt" }
vars = { CLUSTER_API_URL = "https://yusef-cluster-api-web3-storage.loca.lt", ENV = "dev" }

[env.vsantos]
workers_dev = true
account_id = "7ec0b7cf2ec201b2580374e53ba5f37b"
vars = { CLUSTER_API_URL = "https://vsantos-cluster-api-web3-storage.loca.lt" }
vars = { CLUSTER_API_URL = "https://vsantos-cluster-api-web3-storage.loca.lt", ENV = "dev" }

[env.rafaelramalho]
workers_dev = true
account_id = "83d74d9c17d37ac07ea4f27ffa927626"
vars = { CLUSTER_API_URL = "https://rafaelramalho-cluster-api-web3-storage.loca.lt/" }
vars = { CLUSTER_API_URL = "https://rafaelramalho-cluster-api-web3-storage.loca.lt/", ENV = "dev" }

[env.joaopeixoto]
workers_dev = true
account_id = "d7a6d34e62065b452c82b7fe992b8d88"
vars = { CLUSTER_API_URL = "https://joaopeixoto-cluster-api-web3-storage.loca.lt" }
vars = { CLUSTER_API_URL = "https://joaopeixoto-cluster-api-web3-storage.loca.lt", ENV = "dev" }

# Add your env here. Override the the values you need to change.

Expand All @@ -71,4 +71,4 @@ vars = { CLUSTER_API_URL = "https://joaopeixoto-cluster-api-web3-storage.loca.lt
# [env.${whoami}]
# workers_dev = true
# account_id = "<get me from `wrangler whoami`"
# vars = { CLUSTER_API_URL = "https://<your ${whoami} here>-cluster-api-web3-storage.loca.lt" }
# vars = { CLUSTER_API_URL = "https://<your ${whoami} here>-cluster-api-web3-storage.loca.lt", ENV = "dev" }
26 changes: 26 additions & 0 deletions packages/db/fauna/resources/Function/createUpload.js
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,19 @@ const body = Query(
)
)
),
Foreach(
Select('backupUrls', Var('data')),
Lambda(
['url'],
Create('Backup', {
data: {
upload: Select('ref', Var('upload')),
url: Var('url'),
created: Now()
}
})
)
),
Var('upload')
)
)
Expand Down Expand Up @@ -163,6 +176,19 @@ const body = Query(
)
)
),
Foreach(
Select('backupUrls', Var('data')),
Lambda(
['url'],
Create('Backup', {
data: {
upload: Select('ref', Var('upload')),
url: Var('url'),
created: Now()
}
})
)
),
Var('upload')
)
)
Expand Down
Loading

0 comments on commit ae5423a

Please sign in to comment.