admin管理员组

文章数量:1327102

Currently I need to push a large CSV file into a mongo DB and the order of the values needs to determine the key for the DB entry:

Example CSV file:

9,1557,358,286,Mutantville,4368,2358026,,M,0,0,0,1,0
9,1557,359,147,Wroogny,4853,2356061,,D,0,0,0,1,0

Code to parse it into arrays:

const fs = require("fs");
const csv = require("fast-csv");

// Stream rank.txt through the CSV parser; each parsed line is delivered
// to the "data" listener as an array of string values.
const parser = fs.createReadStream("rank.txt").pipe(csv());

// Print every parsed row as it arrives.
parser.on("data", (row) => {
    console.log(row);
});

// Report once the whole file has been consumed.
parser.on("end", () => {
    console.log("Read Finished");
});

Code Output:

[ '9',
  '1557',
  '358',
  '286',
  'Mutantville',
  '4368',
  '2358026',
  '',
  'M',
  '0',
  '0',
  '0',
  '1',
  '0' ]
[ '9',
  '1557',
  '359',
  '147',
  'Wroogny',
  '4853',
  '2356061',
  '',
  'D',
  '0',
  '0',
  '0',
  '1',
  '0' ]

How do I insert the arrays into my mongoose schema to go into mongo db?

Schema:

const mongoose = require("mongoose");

// Fields of one ranking entry. The declaration order is meant to follow
// the column order of the CSV rows, so it must not be rearranged.
const rankFields = {
   serverid: Number,
   resetid: Number,
   rank: Number,
   number: Number,
   name: String,
   land: Number,
   networth: Number,
   tag: String,
   gov: String,
   gdi: Number,
   protection: Number,
   vacation: Number,
   alive: Number,
   deleted: Number
};

const rankSchema = new mongoose.Schema(rankFields);

// Expose the compiled "Rank" model to the rest of the application.
module.exports = mongoose.model("Rank", rankSchema);

The order of the array needs to match the order of the schema; for instance, the first number in the array (9) always needs to be saved under the key "serverid", and so forth. I'm using Node.JS

Currently I need to push a large CSV file into a mongo DB and the order of the values needs to determine the key for the DB entry:

Example CSV file:

9,1557,358,286,Mutantville,4368,2358026,,M,0,0,0,1,0
9,1557,359,147,Wroogny,4853,2356061,,D,0,0,0,1,0

Code to parse it into arrays:

const fs = require("fs");
const csv = require("fast-csv");

// Stream rank.txt through the CSV parser; each parsed line is delivered
// to the "data" listener as an array of string values.
const parser = fs.createReadStream("rank.txt").pipe(csv());

// Print every parsed row as it arrives.
parser.on("data", (row) => {
    console.log(row);
});

// Report once the whole file has been consumed.
parser.on("end", () => {
    console.log("Read Finished");
});

Code Output:

[ '9',
  '1557',
  '358',
  '286',
  'Mutantville',
  '4368',
  '2358026',
  '',
  'M',
  '0',
  '0',
  '0',
  '1',
  '0' ]
[ '9',
  '1557',
  '359',
  '147',
  'Wroogny',
  '4853',
  '2356061',
  '',
  'D',
  '0',
  '0',
  '0',
  '1',
  '0' ]

How do I insert the arrays into my mongoose schema to go into mongo db?

Schema:

const mongoose = require("mongoose");

// Fields of one ranking entry. The declaration order is meant to follow
// the column order of the CSV rows, so it must not be rearranged.
const rankFields = {
   serverid: Number,
   resetid: Number,
   rank: Number,
   number: Number,
   name: String,
   land: Number,
   networth: Number,
   tag: String,
   gov: String,
   gdi: Number,
   protection: Number,
   vacation: Number,
   alive: Number,
   deleted: Number
};

const rankSchema = new mongoose.Schema(rankFields);

// Expose the compiled "Rank" model to the rest of the application.
module.exports = mongoose.model("Rank", rankSchema);

The order of the array needs to match the order of the schema; for instance, the first number in the array (9) always needs to be saved under the key "serverid", and so forth. I'm using Node.JS

Share Improve this question edited May 15, 2018 at 7:30 Neil Lunn 151k36 gold badges355 silver badges325 bronze badges asked May 15, 2018 at 5:50 LogiLogi 1153 silver badges15 bronze badges 1
  • You cannot insert an "array", and you need to convert it to an "object". Usually you would know what the fields should be, either as predefined or by a headerline within the CSV itself. Most CSV parsers will in fact support that. – Neil Lunn Commented May 15, 2018 at 5:53
Add a comment  | 

2 Answers 2

Reset to default 4

You can do it with fast-csv by getting the headers from the schema definition which will return the parsed lines as "objects". You actually have some mismatches, so I've marked them with corrections:

const fs = require('mz/fs');
const csv = require('fast-csv');

// Declare `mongoose` as a real binding. The previous chained form
// (`const { Schema } = mongoose = require(...)`) assigned to an implicit
// global, which throws a ReferenceError in strict mode / ES modules.
const mongoose = require('mongoose');
const { Schema } = mongoose;

const uri = 'mongodb://localhost/test';

mongoose.Promise = global.Promise;
mongoose.set('debug', true);

// The key order of this definition is what gets handed to the CSV parser
// as the column headers, so it must line up with the columns of the file.
// NOTE(review): with the sample rows from the question, the 4th column
// ("286") ends up in `name` and "Mutantville" in `land`; rename or reorder
// the fields if a different mapping is intended.
const rankSchema = new Schema({
  serverid: Number,
  resetid: Number,
  rank: Number,
  name: String,
  land: String,         // <-- You have this as Number but it's a string
  networth: Number,
  tag: String,
  stuff: String,        // the empty field in the csv
  gov: String,
  gdi: Number,
  protection: Number,
  vacation: Number,
  alive: Number,
  deleted: Number
});

const Rank = mongoose.model('Rank', rankSchema);

// Pretty-print a value as indented JSON.
const log = data => console.log(JSON.stringify(data, undefined, 2));

// Imports input.csv into the Rank collection in batches, using stream
// events with pause()/resume() so only one insertMany() is in flight.
(async function() {

  // Flush to MongoDB once more than this many parsed rows are buffered.
  const MAX_BUFFERED = 10000;

  try {
    const conn = await mongoose.connect(uri);

    // Start from a clean slate: empty the collection of every registered model.
    await Promise.all(Object.values(conn.models).map(m => m.remove()));

    // Column names for the parser: the schema paths in declaration order,
    // without the `_id` and `__v` paths.
    const headers = Object.keys(Rank.schema.paths)
      .filter(k => !['_id','__v'].includes(k));

    console.log(headers);

    await new Promise((resolve,reject) => {

      let buffer = [];

      // Write everything buffered so far in a single insertMany() call.
      const flush = async () => {
        if ( buffer.length === 0 ) return;
        const batch = buffer;
        buffer = [];
        await Rank.insertMany(batch);
      };

      const stream = fs.createReadStream('input.csv')
        // pipe() does not forward errors, so a missing or unreadable file
        // has to be caught on the file stream itself.
        .on("error", reject)
        .pipe(csv({ headers }))
        .on("error", reject)
        .on("data", async doc => {
          // Hold back further rows while a batch may be written.
          stream.pause();
          buffer.push(doc);
          log(doc);
          try {
            if ( buffer.length > MAX_BUFFERED ) {
              await flush();
            }
            // Only continue reading when the write (if any) succeeded.
            stream.resume();
          } catch(e) {
            reject(e);
            stream.destroy(e);
          }
        })
        .on("end", async () => {
          try {
            // Send whatever is left, then always settle the promise, even
            // when nothing remained (empty file or exact batch boundary);
            // otherwise the outer await would never finish.
            await flush();
            resolve();
          } catch(e) {
            // The stream has already ended, so reject directly.
            reject(e);
          }
        });

    });


  } catch(e) {
    console.error(e);
    // Make the failure visible to the calling shell; process.exit() below
    // uses this code when called without an argument.
    process.exitCode = 1;
  } finally {
    process.exit();
  }


})();

As long as the schema actually lines up to the provided CSV then it's okay. These are the corrections that I can see but if you need the actual field names aligned differently then you need to adjust. But there was basically a Number in the position where there is a String and essentially an extra field, which I'm presuming is the blank one in the CSV.

The general things are getting the array of field names from the schema and passing that into the options when making the csv parser instance:

// Schema paths in declaration order, without `_id` and `__v`.
const skippedPaths = ['_id','__v'];
const headers = Object.keys(Rank.schema.paths)
  .filter(path => !skippedPaths.includes(path));

// Handing the names to the parser as `headers` makes it emit objects
// keyed by those names.
const stream = fs.createReadStream('input.csv')
  .pipe(csv({ headers }));

Once you actually do that then you get an "Object" back instead of an array:

{
  "serverid": "9",
  "resetid": "1557",
  "rank": "358",
  "name": "286",
  "land": "Mutantville",
  "networth": "4368",
  "tag": "2358026",
  "stuff": "",
  "gov": "M",
  "gdi": "0",
  "protection": "0",
  "vacation": "0",
  "alive": "1",
  "deleted": "0"
}

Don't worry about the "types" because Mongoose will cast the values according to schema.

The rest happens within the handler for the data event. For maximum efficiency we are using insertMany() to only write to the database once every 10,000 lines. How that actually goes to the server and processes depends on the MongoDB version, but 10,000 should be pretty reasonable based on the average number of fields you would import for a single collection in terms of the "trade-off" for memory usage and writing a reasonable network request. Make the number smaller if necessary.

The important parts are to mark these calls as async functions and await the result of the insertMany() before continuing. Also we need to pause() the stream and resume() on each item otherwise we run the risk of overwriting the buffer of documents to insert before they are actually sent. The pause() and resume() are necessary to put "back-pressure" on the pipe, otherwise items just keep "coming out" and firing the data event.

Naturally the control for the 10,000 entries requires we check that both on each iteration and on stream completion in order to empty the buffer and send any remaining documents to the server.

That's really what you want to do, as you certainly don't want to fire off an async request to the server both on "every" iteration through the data event or essentially without waiting for each request to complete. You'll get away with not checking that for "very small files", but for any real world load you're certain to exceed the call stack due to "in flight" async calls which have not yet completed.


FYI - a package.json used. The mz is optional as it's just a modernized Promise enabled library of standard node "built-in" libraries that I'm simply used to using. The code is of course completely interchangeable with the fs module.

{
  "description": "",
  "main": "index.js",
  "dependencies": {
    "fast-csv": "^2.4.1",
    "mongoose": "^5.1.1",
    "mz": "^2.7.0"
  },
  "keywords": [],
  "author": "",
  "license": "ISC"
}

Actually with Node v8.9.x and above then we can even make this much simpler with an implementation of AsyncIterator through the stream-to-iterator module. It's still in Iterator<Promise<T>> mode, but it should do until Node v10.x becomes stable LTS:

const fs = require('mz/fs');
const csv = require('fast-csv');
const streamToIterator = require('stream-to-iterator');

// Declare `mongoose` as a real binding. The previous chained form
// (`const { Schema } = mongoose = require(...)`) assigned to an implicit
// global, which throws a ReferenceError in strict mode / ES modules.
const mongoose = require('mongoose');
const { Schema } = mongoose;

const uri = 'mongodb://localhost/test';

mongoose.Promise = global.Promise;
mongoose.set('debug', true);

// The key order of this definition is what gets handed to the CSV parser
// as the column headers, so it must line up with the columns of the file.
const rankSchema = new Schema({
  serverid: Number,
  resetid: Number,
  rank: Number,
  name: String,
  land: String,
  networth: Number,
  tag: String,
  stuff: String,        // the empty field
  gov: String,
  gdi: Number,
  protection: Number,
  vacation: Number,
  alive: Number,
  deleted: Number
});

const Rank = mongoose.model('Rank', rankSchema);

// Pretty-print a value as indented JSON.
const log = data => console.log(JSON.stringify(data, undefined, 2));

// Imports input.csv into the Rank collection in batches, consuming the
// parser through an iterator of promises instead of stream events.
(async function() {

  // Flush to MongoDB once more than this many parsed rows are buffered.
  const MAX_BUFFERED = 10000;

  try {
    const conn = await mongoose.connect(uri);

    // Start from a clean slate: empty the collection of every registered model.
    await Promise.all(Object.values(conn.models).map(m => m.remove()));

    // Column names for the parser: the schema paths in declaration order,
    // without the `_id` and `__v` paths.
    const headers = Object.keys(Rank.schema.paths)
      .filter(k => !['_id','__v'].includes(k));

    const stream = fs.createReadStream('input.csv')
      .pipe(csv({ headers }));

    const iterator = await streamToIterator(stream).init();

    let buffer = [];

    // Each iteration yields a promise for the next parsed row.
    for ( const docPromise of iterator ) {
      buffer.push(await docPromise);

      if ( buffer.length > MAX_BUFFERED ) {
        await Rank.insertMany(buffer);
        buffer = [];
      }
    }

    // Send whatever is left after the last full batch.
    if ( buffer.length > 0 ) {
      await Rank.insertMany(buffer);
    }

  } catch(e) {
    console.error(e);
    // Make the failure visible to the calling shell; process.exit() below
    // uses this code when called without an argument.
    process.exitCode = 1;
  } finally {
    process.exit();
  }

})();

Basically, all of the stream "event" handling and pausing and resuming gets replaced by a simple for loop:

// Wrap the parser stream so its rows can be consumed with a plain loop.
const iterator = await streamToIterator(stream).init();

// Every step hands back a promise that is awaited to get the parsed row.
for ( const pendingDoc of iterator ) {
  const doc = await pendingDoc;
  // ... The things in the loop
}

Easy! This gets cleaned up in later node implementations with for..await..of when it becomes more stable. But the above runs fine from the specified version and above.

As @Neil Lunn said, you need a header line within the CSV itself.

Example using csvtojson module.

const csv = require('csvtojson');

// NOTE: `filePath` (location of the CSV file, which must start with a
// header line), `Model` (the mongoose model) and `res` (the HTTP response)
// are expected to be provided by the surrounding request handler.
const csvArray = [];

csv()
  .fromFile(filePath)
  // Called once per parsed row, with the header names as keys.
  .on('json', (jsonObj) => {
    csvArray.push({ name: jsonObj.name, id: jsonObj.id });
  })
  // Called when parsing finished; `error` is set if parsing failed.
  .on('done', (error) => {
    if (error) {
      return res.status(500).json({ error });
    }
    // Insert all collected rows, then report the outcome to the client.
    Model.create(csvArray)
      .then((result) => res.status(200).json({ result }))
      // Report the actual rejection reason; the earlier code referenced an
      // undefined `error` variable here.
      .catch((err) => res.status(500).json({ error: err }));
  });

本文标签: javascriptImport CSV Using Mongoose SchemaStack Overflow