admin管理员组文章数量:1278985
So I have downloaded the Wikidata JSON dump and it's about 90GB, too large to load into memory. It consists of a simple JSON structure like this:
[
item,
item,
item,
...
]
Each "item" looks something like this:
{
"type": "item",
"id": "Q23",
"labels": {
"<lang>": obj
},
"descriptions": {
"<lang>": {
"language": "<lang>",
"value": "<string>"
},
},
"aliases": {
"<key>": [
obj,
obj,
],
},
"claims": {
"<keyID>": [
{
"mainsnak": {
"snaktype": "value",
"property": "<keyID>",
"datavalue": {
"value": {
"entity-type": "<type>",
"numeric-id": <num>,
"id": "<id>"
},
"type": "wikibase-entityid"
},
"datatype": "wikibase-item"
},
"type": "statement",
"id": "<anotherId>",
"rank": "preferred",
"references": [
{
"hash": "<hash>",
"snaks": {
"<keyIDX>": [
{
"snaktype": "value",
"property": "P854",
"datavalue": obj,
"datatype": "url"
}
]
},
"snaks-order": [
"<propID>"
]
}
]
}
]
},
"sitelinks": {
"<lang>wiki": {
"site": "<lang>wiki",
"title": "<string>",
"badges": []
}
}
}
The JSON stream is configured like this:
const fs = require('fs')
const zlib = require('zlib')
const { parser } = require('stream-json')
// `stream` is the raw read stream over the gzipped dump file.
let stream = fs.createReadStream('./wikidata/latest-all.json.gz')
// Decompress, run the stream-json parser, and hand every emitted token to buildItem.
const tokens = stream.pipe(zlib.createGunzip()).pipe(parser())
tokens.on('data', buildItem)
/**
 * 'data' handler for the parser stream. Currently a placeholder: it
 * recognises the token names listed below but does nothing for any of them.
 *
 * @param {{name: string, value?: *}} data - one token emitted by the parser
 */
function buildItem(data) {
  switch (data.name) {
    // container boundaries
    case 'startArray':
    case 'endArray':
    case 'startObject':
    case 'endObject':
    // object keys
    case 'startKey':
    case 'endKey':
    case 'keyValue':
    // string values
    case 'startString':
    case 'stringChunk':
    case 'endString':
    case 'stringValue':
    default:
      // nothing is built yet for any token
      break
  }
}
Notice the buildItem
has the key information, it shows that the JSON stream emits objects like this (these are the logs):
{ name: 'startArray' }
{ name: 'startObject' }
{ name: 'startKey' }
{ name: 'startString' }
{ name: 'stringValue', value: 'type' }
{ name: 'endString' }
...
How do you parse this into item
objects like the above? Parsing this linear stream into a tree is very difficult to comprehend.
A sample of output from the JSON stream is here, which you could use to test a parser if it helps.
So I have downloaded the Wikidata JSON dump and it's about 90GB, too large to load into memory. It consists of a simple JSON structure like this:
[
item,
item,
item,
...
]
Each "item" looks something like this:
{
"type": "item",
"id": "Q23",
"labels": {
"<lang>": obj
},
"descriptions": {
"<lang>": {
"language": "<lang>",
"value": "<string>"
},
},
"aliases": {
"<key>": [
obj,
obj,
],
},
"claims": {
"<keyID>": [
{
"mainsnak": {
"snaktype": "value",
"property": "<keyID>",
"datavalue": {
"value": {
"entity-type": "<type>",
"numeric-id": <num>,
"id": "<id>"
},
"type": "wikibase-entityid"
},
"datatype": "wikibase-item"
},
"type": "statement",
"id": "<anotherId>",
"rank": "preferred",
"references": [
{
"hash": "<hash>",
"snaks": {
"<keyIDX>": [
{
"snaktype": "value",
"property": "P854",
"datavalue": obj,
"datatype": "url"
}
]
},
"snaks-order": [
"<propID>"
]
}
]
}
]
},
"sitelinks": {
"<lang>wiki": {
"site": "<lang>wiki",
"title": "<string>",
"badges": []
}
}
}
The JSON stream is configured like this:
const fs = require('fs')
const zlib = require('zlib')
const { parser } = require('stream-json')
// `stream` is the raw read stream over the gzipped dump file.
let stream = fs.createReadStream('./wikidata/latest-all.json.gz')
// Decompress, run the stream-json parser, and hand every emitted token to buildItem.
const tokens = stream.pipe(zlib.createGunzip()).pipe(parser())
tokens.on('data', buildItem)
/**
 * 'data' handler for the parser stream. Currently a placeholder: it
 * recognises the token names listed below but does nothing for any of them.
 *
 * @param {{name: string, value?: *}} data - one token emitted by the parser
 */
function buildItem(data) {
  switch (data.name) {
    // container boundaries
    case 'startArray':
    case 'endArray':
    case 'startObject':
    case 'endObject':
    // object keys
    case 'startKey':
    case 'endKey':
    case 'keyValue':
    // string values
    case 'startString':
    case 'stringChunk':
    case 'endString':
    case 'stringValue':
    default:
      // nothing is built yet for any token
      break
  }
}
Notice the buildItem
has the key information, it shows that the JSON stream emits objects like this (these are the logs):
{ name: 'startArray' }
{ name: 'startObject' }
{ name: 'startKey' }
{ name: 'startString' }
{ name: 'stringValue', value: 'type' }
{ name: 'endString' }
...
How do you parse this into item
objects like the above? Parsing this linear stream into a tree is very difficult to comprehend.
A sample of output from the JSON stream is here, which you could use to test a parser if it helps.
Share Improve this question edited Oct 28, 2020 at 20:32 Lance Pollard asked Oct 28, 2020 at 20:23 Lance PollardLance Pollard 79.5k98 gold badges330 silver badges607 bronze badges2 Answers
Reset to default 9 +250Using Built-In Functions (StreamArray)
stream-json
already has built-in functions that convert streams into objects (in this case, you're looking for StreamArray). You may want to use the built-in functions, as they've been coded with performance in mind.
To use it, it'd look something like:
const fs = require('fs')
const zlib = require('zlib')
const { parser } = require('stream-json')
const { streamArray } = require('stream-json/streamers/StreamArray')
// `stream` is the raw read stream over the gzipped dump file.
let stream = fs.createReadStream('./wikidata/latest-all.json.gz')
// gunzip -> parser tokens -> streamArray; each 'data' event carries its payload in `.value`.
const items = stream
  .pipe(zlib.createGunzip())
  .pipe(parser())
  .pipe(streamArray())
items.on('data', ({ value }) => processData(value))
/**
 * Receives the `value` of one 'data' event from the streamArray stage.
 * This sample only prints it to the console.
 *
 * @param {*} data - the value to print
 */
function processData(data) {
  const item = data
  console.log(item)
}
I recommend taking a look at the wiki at https://github.com/uhop/stream-json/wiki for more information, as it has additional functions, particularly for filtering or transformation, which will probably be useful for you, especially if speed is a concern.
If I understood you correctly you want something like this. I used an ObjectBuilder
class that combines all methods to build one JSON object.
It uses parentStack
to keep track of all objects and arrays. When the object/array is started with startObject/startArray
a new JSON object/array is pushed onto the stack. Once this object/array is completed it is popped off of the stack. The last object that is popped off of the stack is the whole item object and can be processed further (in the example below I just print it out).
The current object or array that is currently being constructed is always on top of the stack.
I had to use a subset of the sample you provided because it did not contain a matching number of startObject
and endObject
items, which resulted in invalid JSON. I included this subset below the code.
Hopefully, this is what you were looking for :)
(Note, I only wrapped buildItem()
function in runSample()
function so that I can include the sample JSON at the bottom to make it look neater in this online editor. You can move buildItem()
function outside.)
/**
 * Assembles one JSON container (object or array) from the token events
 * emitted by a streaming JSON parser.
 *
 * Feed tokens one at a time to `processData`. When the outermost container
 * is closed, `hasFinished()` becomes true and `getFinalObject()` returns the
 * assembled value. Use a fresh builder for every item.
 *
 * A bare scalar at the root (no enclosing object/array) is not supported.
 */
class ObjectBuilder {
    constructor() {
        // The completed root container; undefined until the root is closed.
        this.finalObject = undefined;
        // Containers that are currently open, innermost last.
        this.parentStack = [];
        // Key waiting for its value while the innermost container is an object.
        this.currentKey = undefined;
    }

    /** @returns {boolean} true once the root container has been closed. */
    hasFinished() {
        return this.finalObject !== undefined;
    }

    /** @returns {Object|Array|undefined} the assembled root, or undefined if unfinished. */
    getFinalObject() {
        return this.finalObject;
    }

    /** @returns {Object|Array|undefined} the innermost open container, if any. */
    currentObject() {
        return this.parentStack[this.parentStack.length - 1];
    }

    /**
     * Stores `val` in the innermost open container: appended when that
     * container is an array, assigned under the pending key otherwise.
     *
     * @param {*} val - value to store
     */
    addValue(val) {
        const target = this.currentObject();
        if (Array.isArray(target)) {
            target.push(val);
        } else {
            target[this.currentKey] = val;
            this.currentKey = undefined;
        }
    }

    /**
     * Attaches a new container to its parent (when there is one) and makes it
     * the innermost open container. With an empty stack it becomes the root.
     *
     * @param {Object|Array} container - freshly created `{}` or `[]`
     */
    openContainer(container) {
        if (this.parentStack.length > 0) {
            this.addValue(container);
        }
        this.parentStack.push(container);
        this.currentKey = undefined;
    }

    /**
     * Closes the innermost container. When that empties the stack, the
     * closed container was the root and becomes the final result.
     */
    closeContainer() {
        const closed = this.parentStack.pop();
        this.currentKey = undefined;
        if (this.parentStack.length === 0) {
            this.finalObject = closed;
        }
    }

    /**
     * Consumes a single parser token.
     *
     * @param {{name: string, value?: *}} data - token emitted by the parser
     */
    processData(data) {
        switch (data.name) {
            case 'startKey':
            case 'endKey':
            case 'startString':
            case 'endString':
            case 'stringChunk':
            case 'startNumber':
            case 'endNumber':
            case 'numberChunk':
                // Chunked pieces are ignored; the matching *Value token
                // carries the complete key/string/number.
                break;
            case 'keyValue':
                // Reserve the key now so it exists even before its value arrives.
                this.currentObject()[data.value] = undefined;
                this.currentKey = data.value;
                break;
            case 'numberValue':
                // Numbers arrive as text and are converted here.
                this.addValue(Number(data.value));
                break;
            case 'stringValue':
                this.addValue(data.value);
                break;
            case 'trueValue':
                this.addValue(true);
                break;
            case 'falseValue':
                this.addValue(false);
                break;
            case 'nullValue':
                this.addValue(null);
                break;
            case 'startObject':
                this.openContainer({});
                break;
            case 'startArray':
                this.openContainer([]);
                break;
            case 'endObject':
            case 'endArray':
                this.closeContainer();
                break;
            default:
                // Unknown token names are ignored.
                break;
        }
    }
}
/**
 * Replays a recorded list of parser tokens, assembling each item with an
 * ObjectBuilder and printing every completed item as indented JSON.
 *
 * @param {Array<{name: string, value?: *}>} streamData - recorded tokens; the
 *   first entry (the token opening the outer array of items) is skipped
 */
function runSample(streamData) {
    // Builder for the item currently being assembled; undefined between items.
    let activeBuilder;

    const buildItem = (token) => {
        if (activeBuilder === undefined) {
            if (token.name === "endArray") {
                // No item is open, so this closes the outer array: stream ended.
                return;
            }
            activeBuilder = new ObjectBuilder();
        }

        activeBuilder.processData(token);
        if (!activeBuilder.hasFinished()) {
            return;
        }

        // Item complete: print it and get ready for the next one.
        console.log(JSON.stringify(activeBuilder.getFinalObject(), null, 4));
        activeBuilder = undefined;
    };

    // Simulate reading the stream, leaving out the leading outer-array token.
    for (const token of streamData.slice(1)) {
        buildItem(token);
    }
}
const streamData = [{"name": "startArray"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "type"},{"name": "endKey"},{"name": "keyValue","value": "type"},{"name": "startString"},{"name": "stringChunk","value": "item"},{"name": "endString"},{"name": "stringValue","value": "item"},{"name": "startKey"},{"name": "stringChunk","value": "id"},{"name": "endKey"},{"name": "keyValue","value": "id"},{"name": "startString"},{"name": "stringChunk","value": "Q31"},{"name": "endString"},{"name": "stringValue","value": "Q31"},{"name": "startKey"},{"name": "stringChunk","value": "labels"},{"name": "endKey"},{"name": "keyValue","value": "labels"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "el"},{"name": "endKey"},{"name": "keyValue","value": "el"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "language"},{"name": "endKey"},{"name": "keyValue","value": "language"},{"name": "startString"},{"name": "stringChunk","value": "el"},{"name": "endString"},{"name": "stringValue","value": "el"},{"name": "startKey"},{"name": "stringChunk","value": "value"},{"name": "endKey"},{"name": "keyValue","value": "value"},{"name": "startString"},{"name": "stringChunk","value": "Β"},{"name": "stringChunk","value": "έ"},{"name": "stringChunk","value": "λ"},{"name": "stringChunk","value": "γ"},{"name": "stringChunk","value": "ι"},{"name": "stringChunk","value": "ο"},{"name": "endString"},{"name": "stringValue","value": "Βέλγιο"},{"name": "endObject"},{"name": "startKey"},{"name": "stringChunk","value": "ay"},{"name": "endKey"},{"name": "keyValue","value": "ay"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "language"},{"name": "endKey"},{"name": "keyValue","value": "language"},{"name": "startString"},{"name": "stringChunk","value": "ay"},{"name": "endString"},{"name": "stringValue","value": "ay"},{"name": "startKey"},{"name": "stringChunk","value": "value"},{"name": 
"endKey"},{"name": "keyValue","value": "value"},{"name": "startString"},{"name": "stringChunk","value": "Bilkiya"},{"name": "endString"},{"name": "stringValue","value": "Bilkiya"},{"name": "endObject"},{"name": "startKey"},{"name": "stringChunk","value": "pnb"},{"name": "endKey"},{"name": "keyValue","value": "pnb"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "language"},{"name": "endKey"},{"name": "keyValue","value": "language"},{"name": "startString"},{"name": "stringChunk","value": "pnb"},{"name": "endString"},{"name": "stringValue","value": "pnb"},{"name": "startKey"},{"name": "stringChunk","value": "value"},{"name": "endKey"},{"name": "keyValue","value": "value"},{"name": "startString"},{"name": "stringChunk","value": "ب"},{"name": "stringChunk","value": "ی"},{"name": "stringChunk","value": "ل"},{"name": "stringChunk","value": "ج"},{"name": "stringChunk","value": "ی"},{"name": "stringChunk","value": "م"},{"name": "endString"},{"name": "stringValue","value": "بیلجیم"},{"name": "endObject"},{"name": "endObject"},{"name": "endObject"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "type"},{"name": "endKey"},{"name": "keyValue","value": "type"},{"name": "startString"},{"name": "stringChunk","value": "item"},{"name": "endString"},{"name": "stringValue","value": "item"},{"name": "startKey"},{"name": "stringChunk","value": "id"},{"name": "endKey"},{"name": "keyValue","value": "id"},{"name": "startString"},{"name": "stringChunk","value": "Q31"},{"name": "endString"},{"name": "stringValue","value": "Q31"},{"name": "startKey"},{"name": "stringChunk","value": "labels"},{"name": "endKey"},{"name": "keyValue","value": "labels"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "el"},{"name": "endKey"},{"name": "keyValue","value": "el"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "language"},{"name": "endKey"},{"name": "keyValue","value": 
"language"},{"name": "startString"},{"name": "stringChunk","value": "el"},{"name": "endString"},{"name": "stringValue","value": "el"},{"name": "startKey"},{"name": "stringChunk","value": "value"},{"name": "endKey"},{"name": "keyValue","value": "value"},{"name": "startString"},{"name": "stringChunk","value": "Β"},{"name": "stringChunk","value": "έ"},{"name": "stringChunk","value": "λ"},{"name": "stringChunk","value": "γ"},{"name": "stringChunk","value": "ι"},{"name": "stringChunk","value": "ο"},{"name": "endString"},{"name": "stringValue","value": "Βέλγιο"},{"name": "endObject"},{"name": "startKey"},{"name": "stringChunk","value": "ay"},{"name": "endKey"},{"name": "keyValue","value": "ay"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "language"},{"name": "endKey"},{"name": "keyValue","value": "language"},{"name": "startString"},{"name": "stringChunk","value": "ay"},{"name": "endString"},{"name": "stringValue","value": "ay"},{"name": "startKey"},{"name": "stringChunk","value": "value"},{"name": "endKey"},{"name": "keyValue","value": "value"},{"name": "startString"},{"name": "stringChunk","value": "Bilkiya"},{"name": "endString"},{"name": "stringValue","value": "Bilkiya"},{"name": "endObject"},{"name": "startKey"},{"name": "stringChunk","value": "pnb"},{"name": "endKey"},{"name": "keyValue","value": "pnb"},{"name": "startObject"},{"name": "startKey"},{"name": "stringChunk","value": "language"},{"name": "endKey"},{"name": "keyValue","value": "language"},{"name": "startString"},{"name": "stringChunk","value": "pnb"},{"name": "endString"},{"name": "stringValue","value": "pnb"},{"name": "startKey"},{"name": "stringChunk","value": "value"},{"name": "endKey"},{"name": "keyValue","value": "value"},{"name": "startString"},{"name": "stringChunk","value": "ب"},{"name": "stringChunk","value": "ی"},{"name": "stringChunk","value": "ل"},{"name": "stringChunk","value": "ج"},{"name": "stringChunk","value": "ی"},{"name": "stringChunk","value": 
"م"},{"name": "endString"},{"name": "stringValue","value": "بیلجیم"},{"name": "endObject"},{"name": "startKey"},{"name": "stringChunk","value": "nestedArray"},{"name": "endKey"},{"name": "keyValue","value": "nestedArray"},{"name": "startArray"},{"name": "stringValue","value": "a"},{"name": "stringValue","value": "b"},{"name": "startArray"},{"name": "stringValue","value": "c"},{"name": "startObject"},{"name": "keyValue","value": "another object"},{"name": "stringValue","value": "d"},{"name": "endObject"},{"name": "stringValue","value": "e"},{"name": "endArray"},{"name": "stringValue","value": "b"},{"name": "endArray"},{"name": "endObject"},{"name": "endObject"},{"name": "endArray"}];
// Replay the recorded sample tokens; runSample prints each completed item as JSON.
runSample(streamData);
本文标签: How to parse items from a large JSON stream in JavaScriptStack Overflow
版权声明:本文标题:How to parse items from a large JSON stream in JavaScript? - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1741252970a2366138.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论