Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP]: feat(gatsby): use lokijs for in-memory database #9338

Closed
wants to merge 35 commits into from
Closed
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
64a7149
Added query benchmark
Moocar Oct 4, 2018
ae5de27
Merge branch 'query-benchmark' into integrate-loki
Moocar Oct 19, 2018
4da734d
draft of new run-query
Moocar Oct 19, 2018
0fe2778
added query-sift.js
Moocar Oct 19, 2018
361bb82
move page dependency tracking into build-node-types
Moocar Oct 19, 2018
5364fc5
clean up
Moocar Oct 19, 2018
9e99174
add promise to query-loki
Moocar Oct 19, 2018
409c029
Replaced redux nodes with loki
Moocar Oct 19, 2018
8438bad
create indexes for each field
Moocar Oct 19, 2018
8303810
Dynamically create pages across generated types
Moocar Oct 20, 2018
0172556
update recordings
Moocar Oct 20, 2018
997b826
Merge branch 'query-benchmark' into integrate-loki
Moocar Oct 20, 2018
93f9d91
fix build-node-connections
Moocar Oct 20, 2018
a21efee
make plugin ids unique
Moocar Oct 20, 2018
93c60f7
added tests to loki
Moocar Oct 20, 2018
d3dc9be
getNodesByType
Moocar Oct 21, 2018
d2d9353
format graphql query
Moocar Oct 23, 2018
dd7e631
recordings
Moocar Oct 23, 2018
e9bcfb4
Merge branch 'master' into integrate-loki
Moocar Oct 23, 2018
968ef2b
Added persistence
Moocar Oct 23, 2018
75dae87
linter fixes
Moocar Oct 23, 2018
79a5685
query benchmark PR udpates
Moocar Oct 23, 2018
62aee97
docs
Moocar Oct 23, 2018
ac97c30
docs
Moocar Oct 23, 2018
e936d0e
plugin-fields.js and move fns to db/index
Moocar Oct 23, 2018
d1be8e8
docs
Moocar Oct 23, 2018
9aae188
docs
Moocar Oct 24, 2018
feb0e1b
handle firstOnly
Moocar Oct 24, 2018
1eab83f
docs
Moocar Oct 24, 2018
443b169
fixed
Moocar Oct 24, 2018
059b485
remove loki fs adapter
Moocar Oct 24, 2018
760817e
benchmark machine
Moocar Oct 24, 2018
187adb9
remove unused fn
Moocar Oct 24, 2018
be39c66
only track loki nodes once db is loaded
Moocar Oct 24, 2018
0ec21f7
Merge branch 'master' into integrate-loki
Moocar Oct 25, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions benchmarks/query/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Query benchmark

Stress tests creating lots of queries.

Defaults to building a site with 5k pages split evenly amongst 10 types. Set the `NUM_PAGES` environment variable to change the number of pages, and `NUM_TYPES` to change the number of types they're split over. E.g., to create a site with 5 types, each with 200 pages, run `NUM_TYPES=5 NUM_PAGES=1000 gatsby build`
8 changes: 8 additions & 0 deletions benchmarks/query/bin/runQueryTiming.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Run the build (after purging .cache) and output the amount of time
# taken by the query execution phase
#
# run with `bin/runQueryTiming.sh`
#
# The paths are relative, so run it from the benchmark's root directory
# (the one that contains .cache).

# Delete .cache, then build and keep only the output line(s) containing
# "run graphql queries".
output=$(rm -rf .cache && gatsby build | grep "run graphql queries")
# $output is unquoted, so word splitting collapses runs of whitespace
# before `cut` picks the 6th space-separated field.
# NOTE(review): field 6 is presumably the elapsed time in gatsby's
# progress line; confirm if the build output format changes.
echo $output | cut -d' ' -f 6

117 changes: 117 additions & 0 deletions benchmarks/query/gatsby-node.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
const _ = require(`lodash`)
const faker = require(`faker`)
const fs = require(`fs`)

// Read a positive integer from the environment variable `name`.
//
// Returns `fallback` when the variable is unset or empty. Throws an Error
// when it is set to anything other than a positive integer, because a
// non-numeric value would otherwise turn into NaN further down and make the
// benchmark silently generate no pages at all.
function readCountFromEnv(name, fallback) {
  const raw = process.env[name]
  if (!raw) {
    return fallback
  }
  // Number() rather than parseInt() so that forms like "1e4" keep working.
  const count = Number(raw)
  if (!Number.isInteger(count) || count < 1) {
    throw new Error(`${name} must be a positive integer, but got "${raw}"`)
  }
  return count
}

// Total number of nodes (one page each) to generate.
const NUM_PAGES = readCountFromEnv(`NUM_PAGES`, 5000)

// Number of node types the pages are split over.
// NOTE(review): the benchmark README says the default is 10 types, but the
// default here is 1 - confirm which one is intended.
const NUM_TYPES = readCountFromEnv(`NUM_TYPES`, 1)

// Produce a random type name: a faker lorem word that is camel-cased and
// then capitalized with lodash.
function newTypeName() {
  const randomWord = faker.lorem.word()
  const camelCased = _.camelCase(randomWord)
  return _.capitalize(camelCased)
}

let types = []

// Create NUM_PAGES nodes, split over NUM_TYPES types. Each node has
// the bare minimum of content
exports.sourceNodes = ({ actions: { createNode } }) => {
for (var i = 0; i < NUM_TYPES; i++) {
types.push(newTypeName())
}
// Create markdown nodes
const pagesPerType = NUM_PAGES / NUM_TYPES

let step = 0

_.forEach(types, typeName => {
for (var i = 0; i < pagesPerType; i++) {
step++
const id = `${typeName}${step.toString()}`
createNode({
id,
parent: null,
children: [],
internal: {
type: typeName,
nestedId: id,
content: faker.lorem.word(),
contentDigest: step.toString(),
},
})
}
})
}

// Total hack. It would be nice if we could programmatically generate
// graphQL per component. But in the meantime, we just generate the
// actual component js file with the graphql.
//
// Returns the source text of a page component for `typeName`. The
// component renders the node's id, and its page query looks the node up
// by `internal.nestedId`.
function createPageTemplateJs(typeName) {
  // Used both as the query's root field and as the key read from `data`.
  const queryField = _.lowerFirst(typeName)
  const sourceLines = [
    ``,
    `import React from "react"`,
    `import { graphql } from "gatsby"`,
    ``,
    `export default ({ data }) => {`,
    `const node = data["${queryField}"]`,
    `return (`,
    `<div>`,
    `<h1>{node.id}. Not much ey</h1>`,
    `</div>`,
    `)`,
    `}`,
    ``,
    `export const query = graphql\``,
    `query($id: String!) {`,
    `${queryField}(internal: { nestedId: { eq: $id } }) {`,
    `id`,
    `}`,
    `}`,
    `\``,
    ``,
  ]
  return sourceLines.join(`\n`)
}

// Build the GraphQL query text that selects the id of every `typeName`
// node through the `all${typeName}` connection, sorted by id.
function allTypeQuery(typeName) {
  const queryLines = [
    ``,
    `{`,
    `all${typeName}(sort: { fields: [id] }) {`,
    `edges {`,
    `node {`,
    `id`,
    `}`,
    `}`,
    `}`,
    `}`,
    ``,
  ]
  return queryLines.join(`\n`)
}

// Create a page for each node, and write out a new component js for
// each different type to .cache/${typeName}Template.js
//
// Takes the argument object of gatsby's `createPages` hook (only `graphql`
// and `actions` are used) and the name of the node type to build pages for.
// Returns a promise that resolves once `createPage` has been called for
// every node of that type, and rejects if the listing query reports errors.
async function createTypePages({ graphql, actions }, typeName) {
  const templateSrc = createPageTemplateJs(typeName)
  const templateFilename = `./.cache/${typeName}Template.js`
  fs.writeFileSync(templateFilename, templateSrc)

  const result = await graphql(allTypeQuery(typeName))
  // Fail with the query's own messages instead of a TypeError on the
  // `result.data` access below.
  if (result.errors) {
    const details = result.errors.map(error => error.message).join(`\n`)
    throw new Error(`Listing "${typeName}" nodes failed:\n${details}`)
  }

  // Every page of this type shares one component, so resolve it once.
  const component = require.resolve(templateFilename)
  for (const { node } of result.data[`all${typeName}`].edges) {
    actions.createPage({
      path: `/${typeName}/${node.id}/`,
      component,
      context: {
        id: node.id,
        // NOTE(review): presumably read by the query-index experiment in
        // this PR - confirm it is still needed.
        useQueryIndex: true,
      },
    })
  }
}

exports.createPages = async args => {
_.forEach(types, typeName => {
createTypePages(args, typeName)
})
}
17 changes: 17 additions & 0 deletions benchmarks/query/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"name": "query-benchmark",
"description": "Benchmarks for Gatsby query performance",
"license": "MIT",
"scripts": {
"develop": "gatsby develop",
"build": "gatsby build",
"serve": "gatsby serve"
},
"dependencies": {
"faker": "^4.1.0",
"gatsby": "next",
"lodash": "^4.17.11",
"react": "^16.3.2",
"react-dom": "^16.3.2"
}
}
88 changes: 88 additions & 0 deletions benchmarks/query/recording.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
## Summary

Key findings:

- loki without indexes is overall slightly faster than master, except when there are many types
- loki with indexes is about 2x faster on sites with 10k pages, and 5x faster with 20k pages, but it is ever so slightly slower when those pages are split across 100 types.

Overall, loki is a big win for sites with lots of pages of the same type. For smaller sites, the difference is negligible.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A caching PR I recently worked on had similar findings, which I think is totally reasonable. If it's mostly the same for smaller sites but it makes Gatsby much more scalable I think that's a big win.

I honestly wonder if using lokijs consistently (e.g. for cache and internal Gatsby utilities) would be worthwhile. I'd lean towards yes!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fs fallback is interesting, since I used a similar approach for the cache! Basically I kept 100-200 items in memory, and then it fell back to the filesystem as its cache, which seems to work pretty well for large sites.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Moocar did you measure how much memory is used in these different benchmarks?

@DSchau same question?

Copy link
Contributor

@DSchau DSchau Oct 24, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@KyleAMathews not explicitly (although I can), but I used the upwards bounds of when it ran out of memory as the basis for which I considered the number of items. It is dependent on machine setup, size of items being cached, etc. too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@DSchau Thanks for all the feedback! For caching, using loki would be a step up from using pure arrays, but an even bigger win would probably be to use a real caching library that can also provide persistence. One that has LRU, paging to disk and other cache-specific semantics, assuming one exists! (I haven't looked)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@KyleAMathews Memory usage should be less since we're actually able to remove some caches. But I'll double check. What's your favorite memory profiling tool?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's what I assume too but it's nice to know for sure.

There's probably something on npm but you can also add a setInterval in the code and log out the memory usage every .1 seconds or whatever. You could also log out different events too e.g. when query running starts etc. so you could visualize things easily.


## Benchmarks

Performed on 2018 13" MBP. 4-core 2.7 GHz Intel Core i7. 16 GB 2133 MHz LPDDR3

### Gatsby master

- Gatsby: master

```
query $ NUM_TYPES=1 NUM_PAGES=10000 bin/runQueryTiming.sh
21.135
```

```
query $ NUM_TYPES=100 NUM_PAGES=10000 bin/runQueryTiming.sh
13.112
```

```
query $ NUM_TYPES=1 NUM_PAGES=20000 bin/runQueryTiming.sh
67.812
```

```
query $ NUM_TYPES=100 NUM_PAGES=20000 bin/runQueryTiming.sh
24.656
```

### Gatsby loki without index

- Gatsby:loki
- Index = false
- loki nested index patch

```
query $ NUM_TYPES=1 NUM_PAGES=10000 bin/runQueryTiming.sh
14.834
```

```
query $ NUM_TYPES=100 NUM_PAGES=10000 bin/runQueryTiming.sh
14.676
```

```
query $ NUM_TYPES=1 NUM_PAGES=20000 bin/runQueryTiming.sh
58.377
```

```
query $ NUM_TYPES=100 NUM_PAGES=20000 bin/runQueryTiming.sh
27.486
```

### Gatsby loki with index

- Gatsby:loki
- Index = true
- loki nested index patch

```
query $ NUM_TYPES=1 NUM_PAGES=10000 bin/runQueryTiming.sh
8.126
```

```
query $ NUM_TYPES=100 NUM_PAGES=10000 bin/runQueryTiming.sh
15.050
```

```
query $ NUM_TYPES=1 NUM_PAGES=20000 bin/runQueryTiming.sh
12.797
```

```
query $ NUM_TYPES=100 NUM_PAGES=20000 bin/runQueryTiming.sh
27.020
```
3 changes: 3 additions & 0 deletions benchmarks/query/src/pages/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import React from "react"

export default () => <div>Hello world!</div>
7 changes: 3 additions & 4 deletions packages/gatsby-source-contentful/src/gatsby-node.js
Original file line number Diff line number Diff line change
Expand Up @@ -216,17 +216,16 @@ exports.sourceNodes = async (
// Check if there are any ContentfulAsset nodes and if gatsby-image is installed. If so,
// add fragments for ContentfulAsset and gatsby-image. The fragment will cause an error
// if there's not ContentfulAsset nodes and without gatsby-image, the fragment is useless.
exports.onPreExtractQueries = async ({ store, getNodes }) => {
exports.onPreExtractQueries = async ({ store, getNodesByType }) => {
const program = store.getState().program

const CACHE_DIR = path.resolve(
`${program.directory}/.cache/contentful/assets/`
)
await fs.ensureDir(CACHE_DIR)

const nodes = getNodes()

if (!nodes.some(n => n.internal.type === `ContentfulAsset`)) {
const nodes = getNodesByType(`ContentfulAsset`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll look through the implementation of this as I go through, but would this be worth returning an empty array or something so we can resolve the empty case more easily?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah it would save us all of those checks below

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep! Good call. I'll return the empty collection

if (!nodes || !nodes.length || nodes.length === 0) {
return
}

Expand Down
24 changes: 18 additions & 6 deletions packages/gatsby-transformer-remark/src/extend-node-type.js
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ const withPathPrefix = (url, pathPrefix) =>
const ASTPromiseMap = new Map()

module.exports = (
{ type, store, pathPrefix, getNode, getNodes, cache, reporter },
{ type, store, pathPrefix, getNode, getNodesByType, cache, reporter },
pluginOptions
) => {
if (type.name !== `MarkdownRemark`) {
Expand All @@ -74,7 +74,13 @@ module.exports = (

return new Promise((resolve, reject) => {
// Setup Remark.
const { commonmark = true, footnotes = true, pedantic = true, gfm = true, blocks } = pluginOptions
const {
commonmark = true,
footnotes = true,
pedantic = true,
gfm = true,
blocks,
} = pluginOptions
const remarkOptions = {
gfm,
commonmark,
Expand Down Expand Up @@ -113,7 +119,7 @@ module.exports = (
} else {
const ASTGenerationPromise = new Promise(async resolve => {
if (process.env.NODE_ENV !== `production` || !fileNodes) {
fileNodes = getNodes().filter(n => n.internal.type === `File`)
fileNodes = getNodesByType(`File`)
}
const ast = await new Promise((resolve, reject) => {
// Use Bluebird's Promise function "each" to run remark plugins serially.
Expand Down Expand Up @@ -180,7 +186,7 @@ module.exports = (
// typegen plugins just modify the auto-generated types to add derived fields
// as well as computationally expensive fields.
if (process.env.NODE_ENV !== `production` || !fileNodes) {
fileNodes = getNodes().filter(n => n.internal.type === `File`)
fileNodes = getNodesByType(`File`)
}
// Use Bluebird's Promise function "each" to run remark plugins serially.
Promise.each(pluginOptions.plugins, plugin => {
Expand Down Expand Up @@ -249,10 +255,16 @@ module.exports = (
const addSlugToUrl = function(node) {
if (node.url) {
if (_.get(markdownNode, pathToSlugField) === undefined) {
console.warn(`Skipping TableOfContents. Field '${pathToSlugField}' missing from markdown node`)
console.warn(
`Skipping TableOfContents. Field '${pathToSlugField}' missing from markdown node`
)
return null
}
node.url = [pathPrefix, _.get(markdownNode, pathToSlugField), node.url]
node.url = [
pathPrefix,
_.get(markdownNode, pathToSlugField),
node.url,
]
.join(`/`)
.replace(/\/\//g, `/`)
}
Expand Down
6 changes: 2 additions & 4 deletions packages/gatsby-transformer-screenshot/src/gatsby-node.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,11 @@ const screenshotQueue = new Queue(
)

exports.onPreBootstrap = (
{ store, cache, actions, createNodeId, getNodes, createContentDigest },
{ store, cache, actions, createNodeId, getNodesByType, createContentDigest },
pluginOptions
) => {
const { createNode, touchNode } = actions
const screenshotNodes = getNodes().filter(
n => n.internal.type === `Screenshot`
)
const screenshotNodes = getNodesByType(`Screenshot`)

if (screenshotNodes.length === 0) {
return null
Expand Down
7 changes: 3 additions & 4 deletions packages/gatsby-transformer-sharp/src/gatsby-node.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@ const fs = require(`fs-extra`)
exports.onCreateNode = require(`./on-node-create`)
exports.setFieldsOnGraphQLNodeType = require(`./extend-node-type`)

exports.onPreExtractQueries = async ({ store, getNodes }) => {
exports.onPreExtractQueries = async ({ store, getNodesByType }) => {
const program = store.getState().program

// Check if there are any ImageSharp nodes. If so add fragments for ImageSharp.
// The fragment will cause an error if there are no ImageSharp nodes.
const nodes = getNodes()

if (!nodes.some(n => n.internal.type === `ImageSharp`)) {
const nodes = getNodesByType(`ImageSharp`)
if (!nodes || !nodes.length || nodes.length === 0) {
return
}

Expand Down
1 change: 1 addition & 0 deletions packages/gatsby/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
"json-stringify-safe": "^5.0.1",
"kebab-hash": "^0.1.2",
"lodash": "^4.17.10",
"lokijs": "^1.5.5",
"md5": "^2.2.1",
"md5-file": "^3.1.1",
"mime": "^2.2.0",
Expand Down
Loading