-
Notifications
You must be signed in to change notification settings - Fork 10.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP]: feat(gatsby): use lokijs for in-memory database #9338
Changes from 34 commits
64a7149
ae5de27
4da734d
0fe2778
361bb82
5364fc5
9e99174
409c029
8438bad
8303810
0172556
997b826
93f9d91
a21efee
93c60f7
d3dc9be
d2d9353
dd7e631
e9bcfb4
968ef2b
75dae87
79a5685
62aee97
ac97c30
e936d0e
d1be8e8
9aae188
feb0e1b
1eab83f
443b169
059b485
760817e
187adb9
be39c66
0ec21f7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Query benchmark | ||
|
||
Stress tests creating lots of queries. | ||
|
||
Defaults to building a site with 5k pages of a single type. Set the `NUM_PAGES` environment variable to change the number of pages, and `NUM_TYPES` to change the number of types they're split over. E.g. to create a site with 5 types, each with 200 pages, do `NUM_TYPES=5 NUM_PAGES=1000 gatsby build` |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Run the build (after purging .cache) and output the amount of time | ||
# taken by the query execution phase | ||
# | ||
# run with `bin/runQueryTiming.sh` | ||
|
||
# Purge the cache, run a full build, and keep only the build-log line
# that mentions the "run graphql queries" phase.
timing_line=$(rm -rf .cache && gatsby build | grep "run graphql queries")

# Print the sixth space-separated field of that line.
# NOTE(review): the variable is left unquoted, so the shell collapses runs
# of whitespace before `cut` counts fields - confirm before quoting it.
echo $timing_line | cut -d' ' -f 6
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
const _ = require(`lodash`) | ||
const faker = require(`faker`) | ||
const fs = require(`fs`) | ||
|
||
// Total number of nodes (and therefore pages) to generate. Override with
// the NUM_PAGES environment variable. Environment values are strings, so
// convert them to keep this a number no matter how it was set.
let NUM_PAGES = 5000
if (process.env.NUM_PAGES) {
  NUM_PAGES = Number(process.env.NUM_PAGES)
}

// Number of distinct node types the pages are split over. Override with
// the NUM_TYPES environment variable (converted to a number, as above).
let NUM_TYPES = 1
if (process.env.NUM_TYPES) {
  NUM_TYPES = Number(process.env.NUM_TYPES)
}
|
||
// Build a random node type name: take a random lorem word, camel-case it,
// then capitalize it.
function newTypeName() {
  const word = faker.lorem.word()
  const camelCased = _.camelCase(word)
  return _.capitalize(camelCased)
}
|
||
let types = [] | ||
|
||
// Create NUM_PAGES nodes, split over NUM_TYPES types. Each node has | ||
// the bare minimum of content | ||
exports.sourceNodes = ({ actions: { createNode } }) => { | ||
for (var i = 0; i < NUM_TYPES; i++) { | ||
types.push(newTypeName()) | ||
} | ||
// Create markdown nodes | ||
const pagesPerType = NUM_PAGES / NUM_TYPES | ||
|
||
let step = 0 | ||
|
||
_.forEach(types, typeName => { | ||
for (var i = 0; i < pagesPerType; i++) { | ||
step++ | ||
const id = `${typeName}${step.toString()}` | ||
createNode({ | ||
id, | ||
parent: null, | ||
children: [], | ||
internal: { | ||
type: typeName, | ||
nestedId: id, | ||
content: faker.lorem.word(), | ||
contentDigest: step.toString(), | ||
}, | ||
}) | ||
} | ||
}) | ||
} | ||
|
||
// Total hack. Ideally the graphQL for each component would be generated
// programmatically. Until that is possible, emit the full source of a
// page component, query included, as a string for the given type.
function createPageTemplateJs(typeName) {
  // Name of the single-node query field: the type name with its first
  // letter lower-cased
  const fieldName = _.lowerFirst(typeName)
  return `
import React from "react"
import { graphql } from "gatsby"

export default ({ data }) => {
  const node = data["${fieldName}"]
  return (
    <div>
      <h1>{node.id}. Not much ey</h1>
    </div>
  )
}

export const query = graphql\`
  query($id: String!) {
    ${fieldName}(internal: { nestedId: { eq: $id } }) {
      id
    }
  }
\`
`
}
|
||
// Build the graphQL query text that lists the id of every node of the
// given type, sorted by id.
function allTypeQuery(typeName) {
  const connectionField = `all${typeName}`
  return `
    {
      ${connectionField}(sort: { fields: [id] }) {
        edges {
          node {
            id
          }
        }
      }
    }
  `
}
|
||
// Create a page for each node of the given type. The page component is
// generated on the fly and written to .cache/${typeName}Template.js
//
// Rejects if the query listing the type's nodes reports errors.
async function createTypePages({ graphql, actions }, typeName) {
  const templateFilename = `./.cache/${typeName}Template.js`
  fs.writeFileSync(templateFilename, createPageTemplateJs(typeName))
  // Every page of this type shares the same component, so resolve its
  // path once rather than once per page
  const component = require.resolve(templateFilename)

  const result = await graphql(allTypeQuery(typeName))
  if (result.errors) {
    // Without this check a failed query only shows up as a TypeError when
    // result.data is read below
    const messages = _.map(result.errors, error => error.message).join(`; `)
    throw new Error(`Query for all${typeName} failed: ${messages}`)
  }

  _.forEach(result.data[`all${typeName}`].edges, ({ node }) => {
    actions.createPage({
      path: `/${typeName}/${node.id}/`,
      component,
      context: {
        id: node.id,
        useQueryIndex: true,
      },
    })
  })
}
|
||
exports.createPages = async args => { | ||
_.forEach(types, typeName => { | ||
createTypePages(args, typeName) | ||
}) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
{ | ||
"name": "query-benchmark", | ||
"description": "Benchmarks for Gatsby query performance", | ||
"license": "MIT", | ||
"scripts": { | ||
"develop": "gatsby develop", | ||
"build": "gatsby build", | ||
"serve": "gatsby serve" | ||
}, | ||
"dependencies": { | ||
"faker": "^4.1.0", | ||
"gatsby": "next", | ||
"lodash": "^4.17.11", | ||
"react": "^16.3.2", | ||
"react-dom": "^16.3.2" | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
## Summary | ||
|
||
Key findings: | ||
|
||
- loki without indexes is overall slightly faster than master, except when there are many types | ||
- loki with indexes is about 2x faster on sites with 10k pages, and 5x faster with 20k pages, but is ever so slightly slower when those pages are split across 100 types. | ||
|
||
Overall, loki is a big win for sites with lots of pages of the same type. For smaller sites, the difference is negligible. | ||
|
||
## Benchmarks | ||
|
||
Performed on 2018 13" MBP. 4-core 2.7 GHz Intel Core i7. 16 GB 2133 MHz LPDDR3 | ||
|
||
### Gatsby master | ||
|
||
- Gatsby: master | ||
|
||
``` | ||
query $ NUM_TYPES=1 NUM_PAGES=10000 bin/runQueryTiming.sh | ||
21.135 | ||
``` | ||
|
||
``` | ||
query $ NUM_TYPES=100 NUM_PAGES=10000 bin/runQueryTiming.sh | ||
13.112 | ||
``` | ||
|
||
``` | ||
query $ NUM_TYPES=1 NUM_PAGES=20000 bin/runQueryTiming.sh | ||
67.812 | ||
``` | ||
|
||
``` | ||
query $ NUM_TYPES=100 NUM_PAGES=20000 bin/runQueryTiming.sh | ||
24.656 | ||
``` | ||
|
||
### Gatsby loki without index | ||
|
||
- Gatsby:loki | ||
- Index = false | ||
- loki nested index patch | ||
|
||
``` | ||
query $ NUM_TYPES=1 NUM_PAGES=10000 bin/runQueryTiming.sh | ||
14.834 | ||
``` | ||
|
||
``` | ||
query $ NUM_TYPES=100 NUM_PAGES=10000 bin/runQueryTiming.sh | ||
14.676 | ||
``` | ||
|
||
``` | ||
query $ NUM_TYPES=1 NUM_PAGES=20000 bin/runQueryTiming.sh | ||
58.377 | ||
``` | ||
|
||
``` | ||
query $ NUM_TYPES=100 NUM_PAGES=20000 bin/runQueryTiming.sh | ||
27.486 | ||
``` | ||
|
||
### Gatsby loki with index | ||
|
||
- Gatsby:loki | ||
- Index = true | ||
- loki nested index patch | ||
|
||
``` | ||
query $ NUM_TYPES=1 NUM_PAGES=10000 bin/runQueryTiming.sh | ||
8.126 | ||
``` | ||
|
||
``` | ||
query $ NUM_TYPES=100 NUM_PAGES=10000 bin/runQueryTiming.sh | ||
15.050 | ||
``` | ||
|
||
``` | ||
query $ NUM_TYPES=1 NUM_PAGES=20000 bin/runQueryTiming.sh | ||
12.797 | ||
``` | ||
|
||
``` | ||
query $ NUM_TYPES=100 NUM_PAGES=20000 bin/runQueryTiming.sh | ||
27.020 | ||
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import React from "react" | ||
|
||
export default () => <div>Hello world!</div> |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -216,17 +216,16 @@ exports.sourceNodes = async ( | |
// Check if there are any ContentfulAsset nodes and if gatsby-image is installed. If so, | ||
// add fragments for ContentfulAsset and gatsby-image. The fragment will cause an error | ||
// if there's not ContentfulAsset nodes and without gatsby-image, the fragment is useless. | ||
exports.onPreExtractQueries = async ({ store, getNodes }) => { | ||
exports.onPreExtractQueries = async ({ store, getNodesByType }) => { | ||
const program = store.getState().program | ||
|
||
const CACHE_DIR = path.resolve( | ||
`${program.directory}/.cache/contentful/assets/` | ||
) | ||
await fs.ensureDir(CACHE_DIR) | ||
|
||
const nodes = getNodes() | ||
|
||
if (!nodes.some(n => n.internal.type === `ContentfulAsset`)) { | ||
const nodes = getNodesByType(`ContentfulAsset`) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll look through the implementation of this as I go through, but would this be worth returning an empty array or something so we can resolve the empty case more easily? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah it would save us all of those checks below There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep! Good call. I'll return the empty collection |
||
if (!nodes || !nodes.length || nodes.length === 0) { | ||
return | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A caching PR I recently worked on had similar findings, which I think is totally reasonable. If it's mostly the same for smaller sites but it makes Gatsby much more scalable I think that's a big win.
I honestly wonder if using lokijs consistently (e.g. for cache and internal Gatsby utilities) would be worthwhile. I'd lean towards yes!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The fs fallback is interesting, since I used a similar approach for the cache! Basically I kept 100-200 items in memory, and then it fell back to the filesystem as its cache, which seems to work pretty well for large sites.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Moocar did you measure how much memory is used in these different benchmarks?
@DSchau same question?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@KyleAMathews not explicitly (although I can), but I used the upwards bounds of when it ran out of memory as the basis for which I considered the number of items. It is dependent on machine setup, size of items being cached, etc. too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@DSchau Thanks for all the feedback! For caching, using loki would be a step up from using pure arrays, but an even bigger win would probably be to use a real caching library that can also provide persistence. One that has LRU, paging to disk and other cache-specific semantics, assuming one exists! (I haven't looked)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@KyleAMathews Memory usage should be less since we're actually able to remove some caches. But I'll double check. What's your favorite memory profiling tool?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's what I assume too but it's nice to know for sure.
There's probably something on npm but you can also add a setInterval in the code and log out the memory usage every .1 seconds or whatever. You could also log out different events too e.g. when query running starts etc. so you could visualize things easily.