Assign feed item's categories to post.

Changed htmlparser2 to node-feedparser
This commit is contained in:
Delgermurun 2015-03-16 20:02:33 +08:00
parent 6f9e11bfe5
commit a05fa134e4
4 changed files with 136 additions and 99 deletions

View file

@ -1,22 +1,19 @@
{
"dependencies": {
"he": {
"version": "0.5.0"
},
"htmlparser2": {
"version": "3.8.2",
"feedparser": {
"version": "1.0.0",
"dependencies": {
"domhandler": {
"version": "2.3.0"
"sax": {
"version": "0.6.1"
},
"domutils": {
"version": "1.5.0"
"addressparser": {
"version": "0.1.3"
},
"domelementtype": {
"version": "1.1.3"
"array-indexofobject": {
"version": "0.0.1"
},
"readable-stream": {
"version": "1.1.13",
"version": "1.0.33",
"dependencies": {
"core-util-is": {
"version": "1.0.1"
@ -31,11 +28,11 @@
"version": "2.0.1"
}
}
}
}
},
"entities": {
"version": "1.0.0"
}
}
"he": {
"version": "0.5.0"
},
"to-markdown": {
"version": "0.0.2"

View file

@ -13,7 +13,7 @@ var feedSchema = new SimpleSchema({
return {
value: user._id,
label: getDisplayName(user)
}
};
});
return users;
}
@ -32,7 +32,7 @@ var feedSchema = new SimpleSchema({
return {
value: category._id,
label: category.name
}
};
});
return categories;
}
@ -40,7 +40,7 @@ var feedSchema = new SimpleSchema({
}
});
Feeds = new Meteor.Collection("feeds");
Feeds = new Meteor.Collection('feeds');
Feeds.attachSchema(feedSchema);
// used to keep track of which feed a post was imported from

View file

@ -1,43 +1,87 @@
var htmlParser = Npm.require('htmlparser2');
var toMarkdown = Npm.require('to-markdown').toMarkdown;
var he = Npm.require('he')
var he = Npm.require('he');
var FeedParser = Npm.require('feedparser');
var Readable = Npm.require('stream').Readable;
/**
 * Fetch a single admin user, asking the users collection to sort by
 * ascending `createdAt`.
 *
 * @returns {*} whatever `Meteor.users.findOne` yields for that query.
 */
var getFirstAdminUser = function() {
  var adminSelector = {isAdmin: true};
  var byCreationAscending = {sort: {createdAt: 1}};
  return Meteor.users.findOne(adminSelector, byCreationAscending);
};
var feedHandler = {
getStream: function(content) {
var stream = new Readable();
stream.push(content);
stream.push(null);
return stream;
},
// Build the category list for one feed item.
//   categories: feed-level category ids supplied by the caller (may be falsy).
//   item: parsed feed item; its optional `categories` array holds category names.
// Returns the ids of Categories documents whose name matches an item category,
// followed by the feed-level `categories` when those are truthy. When no item
// category matched, `categories` is returned unchanged.
getItemCategories: function(categories, item) {
var itemCategories = [],
category;
if (item.categories && item.categories.length > 0) {
item.categories.forEach(function(name) {
// Only the _id field is requested; names without a matching document are skipped.
category = Categories.findOne({name: name}, {fields: {_id: 1}});
if (category) {
itemCategories.push(category._id);
}
});
}
var handleFeed = function(error, feed) {
if (error) return;
// At least one item category matched: append the feed-level ones after them.
// No de-duplication is performed here.
if (itemCategories.length > 0) {
if (categories) {
itemCategories = itemCategories.concat(categories);
}
} else {
// Nothing matched: hand back exactly what the caller passed in.
itemCategories = categories;
}
var feedItems = _.first(feed.items, 20); // limit feed to 20 items just in case
var userId = this._parser._options.userId;
var feedId = this._parser._options.feedId;
var categories = this._parser._options.categories;
return itemCategories;
},
clog('// Parsing RSS feed: '+ feed.title)
handle: function(content, userId, categories, feedId) {
var stream = this.getStream(content),
feedParser = new FeedParser(),
newItemsCount = 0,
self = this;
var newItemsCount = 0;
stream.pipe(feedParser);
feedItems.forEach(function(item, index, array) {
feedParser.on('meta', Meteor.bindEnvironment(function(meta) {
clog('// Parsing RSS feed: '+ meta.title);
}));
// if item has no id, use the URL to give it one
if (!item.id)
item.id = item.link;
feedParser.on('error', Meteor.bindEnvironment(function(error) {
clog(error);
}));
feedParser.on('readable', Meteor.bindEnvironment(function() {
var s = this, item;
while (item = s.read()) {
// if item has no guid, use the URL to give it one
if (!item.guid) {
item.guid = item.link;
}
// check if post already exists
if (!!Posts.findOne({feedItemId: item.id})) {
// clog('// Feed item already imported')
} else {
if (!!Posts.findOne({feedItemId: item.guid})) {
// clog('// Feed item already imported');
continue;
}
newItemsCount++;
var post = {
title: he.decode(item.title),
url: item.link,
feedId: feedId,
feedItemId: item.id,
feedItemId: item.guid,
userId: userId,
categories: categories
}
categories: self.getItemCategories(categories, item)
};
if (item.description)
post.body = toMarkdown(he.decode(item.description));
@ -46,13 +90,14 @@ var handleFeed = function(error, feed) {
// if RSS item link is a 301 or 302 redirect, follow the redirect
var get = HTTP.get(item.link, {followRedirects: false});
if (!!get.statusCode && (get.statusCode === 301 || get.statusCode === 302) && !!get.headers && !!get.headers.location) {
if (!!get.statusCode && (get.statusCode === 301 || get.statusCode === 302) &&
!!get.headers && !!get.headers.location) {
post.url = get.headers.location;
}
// if RSS item has a date, use it
if (item.pubDate)
post.postedAt = moment(item.pubDate).toDate();
if (item.pubdate)
post.postedAt = moment(item.pubdate).toDate();
try {
submitPost(post);
@ -60,14 +105,16 @@ var handleFeed = function(error, feed) {
// catch errors so they don't stop the loop
clog(error);
}
}
});
clog('// Found ' + newItemsCount + ' new feed items')
// clog('// Found ' + newItemsCount + ' new feed items');
}, function() {
clog('Failed to bind environment');
}, feedParser));
}
};
fetchFeeds = function() {
var fetchFeeds = function() {
var content;
Feeds.find().forEach(function(feed) {
@ -78,21 +125,14 @@ fetchFeeds = function() {
var feedId = feed._id;
try {
content = HTTP.get(feed.url).content;
var feedHandler = new htmlParser.FeedHandler(handleFeed);
var parser = new htmlParser.Parser(feedHandler, {xmlMode: true, userId: userId, categories:categories, feedId:feedId});
parser.write(content);
parser.end();
feedHandler.handle(content, userId, categories, feedId);
} catch (error) {
console.log(error);
return true; // just go to next url
}
});
}
};
Meteor.methods({
fetchFeeds: function () {

View file

@ -5,7 +5,7 @@ Package.describe({
});
Npm.depends({
'htmlparser2': '3.8.2',
'feedparser': '1.0.0',
'to-markdown': '0.0.2',
'he': '0.5.0'
});