Skip to content

Commit

Permalink
Fix the search, similar, list, and developer methods (#557)
Browse files Browse the repository at this point in the history
* Fix search script

* Fix app mapping - From PR#545

* Address comments on PR

* Fix mapping for similar script

* Handle different sections in search results page

* Fix gplay.list method

* Fix mappings for price field

* Fix tests

* lint

* Fix the developer method

* Address comments on code review

* Update lib/app.js

* Update lib/list.js

* Update test/lib.app.js

* Update test/lib.app.js

* Update lib.app.js

Co-authored-by: Facundo Olano <[email protected]>
  • Loading branch information
mmoksh and facundoolano authored Jun 23, 2022
1 parent 6ba483a commit e1e5439
Show file tree
Hide file tree
Showing 15 changed files with 1,518 additions and 1,197 deletions.
184 changes: 59 additions & 125 deletions lib/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@ const R = require('ramda');
const queryString = require('querystring');
const request = require('./utils/request');
const scriptData = require('./utils/scriptData');
const debug = require('debug')('google-play-scraper');
const cheerio = require('cheerio');
const { BASE_URL } = require('./constants');
const helper = require('./utils/mappingHelpers');

const PLAYSTORE_URL = `${BASE_URL}/store/apps/details`;

Expand Down Expand Up @@ -43,169 +42,104 @@ function app (opts) {
}

const MAPPINGS = {
// FIXME add appId
title: ['ds:5', 0, 0, 0],
title: ['ds:4', 1, 2, 0, 0],
description: {
path: ['ds:5', 0, 10, 0, 1],
fun: descriptionText
path: ['ds:4', 1, 2, 72, 0, 1],
fun: helper.descriptionText
},
descriptionHTML: ['ds:5', 0, 10, 0, 1],
summary: ['ds:5', 0, 10, 1, 1],
installs: ['ds:5', 0, 12, 9, 0],
minInstalls: ['ds:5', 0, 12, 9, 1],
maxInstalls: ['ds:5', 0, 12, 9, 2],
score: ['ds:6', 0, 6, 0, 1],
scoreText: ['ds:6', 0, 6, 0, 0],
ratings: ['ds:6', 0, 6, 2, 1],
reviews: ['ds:6', 0, 6, 3, 1],
descriptionHTML: ['ds:4', 1, 2, 72, 0, 1],
summary: ['ds:4', 1, 2, 73, 0, 1],
installs: ['ds:4', 1, 2, 13, 0],
minInstalls: ['ds:4', 1, 2, 13, 1],
maxInstalls: ['ds:4', 1, 2, 13, 2],
score: ['ds:4', 1, 2, 51, 0, 1],
scoreText: ['ds:4', 1, 2, 51, 0, 0],
ratings: ['ds:4', 1, 2, 51, 2, 1],
reviews: ['ds:4', 1, 2, 51, 3, 1],
histogram: {
path: ['ds:6', 0, 6, 1],
fun: buildHistogram
path: ['ds:4', 1, 2, 51, 1],
fun: helper.buildHistogram
},

price: {
path: ['ds:3', 0, 2, 0, 0, 0, 1, 0, 0],
path: ['ds:4', 1, 2, 57, 0, 0, 0, 0, 1, 0, 0],
fun: (val) => val / 1000000 || 0
},
free: {
path: ['ds:3', 0, 2, 0, 0, 0, 1, 0, 0],
path: ['ds:4', 1, 2, 57, 0, 0, 0, 0, 1, 0, 0],
// considered free only if price is exactly zero
fun: (val) => val === 0
},
currency: ['ds:3', 0, 2, 0, 0, 0, 1, 0, 1],
currency: ['ds:4', 1, 2, 57, 0, 0, 0, 0, 1, 0, 1],
priceText: {
path: ['ds:3', 0, 2, 0, 0, 0, 1, 0, 2],
fun: priceText
path: ['ds:4', 1, 2, 57, 0, 0, 0, 0, 1, 0, 2],
fun: helper.priceText
},
available: {
path: ['ds:5', 0, 12, 11, 0],
path: ['ds:4', 1, 2, 18, 0],
fun: Boolean
},
offersIAP: {
path: ['ds:5', 0, 12, 12, 0],
path: ['ds:4', 1, 2, 19, 0],
fun: Boolean
},
IAPRange: ['ds:5', 0, 12, 12, 0],
size: ['ds:8', 0],
IAPRange: ['ds:4', 1, 2, 19, 0],
androidVersion: {
path: ['ds:8', 2],
fun: normalizeAndroidVersion
path: ['ds:4', 1, 2, 140, 1, 1, 0, 0, 1],
fun: helper.normalizeAndroidVersion
},
androidVersionText: {
path: ['ds:4', 1, 2, 140, 1, 1, 0, 0, 1],
fun: (version) => version || 'Varies with device'
},
androidVersionText: ['ds:8', 2],
developer: ['ds:5', 0, 12, 5, 1],
developer: ['ds:4', 1, 2, 68, 0],
developerId: {
path: ['ds:5', 0, 12, 5, 5, 4, 2],
path: ['ds:4', 1, 2, 68, 1, 4, 2],
fun: (devUrl) => devUrl.split('id=')[1]
},
developerEmail: ['ds:4', 1, 2, 69, 1, 0],
developerWebsite: ['ds:4', 1, 2, 69, 0, 5, 2],
developerAddress: ['ds:4', 1, 2, 69, 2, 0],
privacyPolicy: ['ds:4', 1, 2, 99, 0, 5, 2],
developerInternalID: {
path: ['ds:4', 1, 2, 68, 1, 4, 2],
fun: (devUrl) => devUrl.split('id=')[1]
},
developerEmail: ['ds:5', 0, 12, 5, 2, 0],
developerWebsite: ['ds:5', 0, 12, 5, 3, 5, 2],
developerAddress: ['ds:5', 0, 12, 5, 4, 0],
privacyPolicy: ['ds:5', 0, 12, 7, 2],
developerInternalID: ['ds:5', 0, 12, 5, 0, 0],
genre: ['ds:5', 0, 12, 13, 0, 0],
genreId: ['ds:5', 0, 12, 13, 0, 2],
genre: ['ds:4', 1, 2, 79, 0, 0, 0],
genreId: ['ds:4', 1, 2, 79, 0, 0, 2],
familyGenre: ['ds:5', 0, 12, 13, 1, 0],
familyGenreId: ['ds:5', 0, 12, 13, 1, 2],
icon: ['ds:5', 0, 12, 1, 3, 2],
headerImage: ['ds:5', 0, 12, 2, 3, 2],
icon: ['ds:4', 1, 2, 95, 0, 3, 2],
headerImage: ['ds:4', 1, 2, 96, 0, 3, 2],
screenshots: {
path: ['ds:5', 0, 12, 0],
path: ['ds:4', 1, 2, 78, 0],
fun: (screenshots) => {
if (screenshots === null) return [];
return screenshots.map(R.path([3, 2]));
}
},
video: ['ds:5', 0, 12, 3, 0, 3, 2],
videoImage: ['ds:5', 0, 12, 3, 1, 3, 2],
contentRating: ['ds:5', 0, 12, 4, 0],
contentRatingDescription: ['ds:5', 0, 12, 4, 2, 1],
video: ['ds:4', 1, 2, 100, 0, 0, 3, 2],
videoImage: ['ds:4', 1, 2, 100, 1, 0, 3, 2],
contentRating: ['ds:4', 1, 2, 9, 0],
contentRatingDescription: ['ds:4', 1, 2, 9, 2, 1],
adSupported: {
path: ['ds:5', 0, 12, 14, 0],
path: ['ds:4', 1, 2, 48],
fun: Boolean
},
released: ['ds:5', 0, 12, 36],
released: ['ds:4', 1, 2, 10, 0],
updated: {
path: ['ds:5', 0, 12, 8, 0],
path: ['ds:4', 1, 2, 145, 0, 1, 0],
fun: (ts) => ts * 1000
},
version: ['ds:8', 1],
recentChanges: ['ds:5', 0, 12, 6, 1],
comments: {
useServiceRequestId: 'UsvDTd',
path: [0],
fun: extractComments
},
editorsChoice: {
path: ['ds:5', 0, 12, 15, 0],
fun: Boolean
version: {
path: ['ds:4', 1, 2, 140, 0, 0, 0],
fun: (val) => val || 'VARY'
},
features: {
path: ['ds:5', 0, 12, 16],
fun: extractFeatures
recentChanges: ['ds:4', 1, 2, 144, 1, 1],
comments: {
path: ['ds:8', 0],
isArray: true,
fun: helper.extractComments
}
};

/**
 * Convert the raw features container into a list of
 * `{ title, description }` objects.
 *
 * @param {Array|null|undefined} featuresArray Raw container; the feature
 *   entries are read from index 2.
 * @returns {Array<Object>} One object per feature, or an empty array when the
 *   container or its feature list is missing.
 */
function extractFeatures (featuresArray) {
  // Treat undefined the same as null: indexing either one would throw.
  if (featuresArray === null || featuresArray === undefined) {
    return [];
  }

  const features = featuresArray[2] || [];

  return features.map(feature => ({
    title: feature[0],
    description: R.path([1, 0, 0, 1], feature)
  }));
}

/**
 * Convert the HTML description into plain text.
 *
 * @param {string} description Description markup.
 * @returns {string} Text content of the description wrapped in a <div>.
 */
function descriptionText (description) {
  // swap <br> tags for CRLF first so the line breaks survive text extraction
  const withLineBreaks = description.replace(/<br>/g, '\r\n');
  const $ = cheerio.load('<div>' + withLineBreaks + '</div>');
  return $('div').text();
}

/**
 * Return the given price label, falling back to 'Free' when it is falsy.
 *
 * @param {string} priceText Price label.
 * @returns {string} The label itself, or 'Free'.
 */
function priceText (priceText) {
  if (priceText) {
    return priceText;
  }

  return 'Free';
}

/**
 * Reduce an Android version label to its leading version number.
 *
 * @param {string|null|undefined} androidVersionText Version label; only the
 *   part before the first space is inspected.
 * @returns {string} The leading token when `parseFloat` yields a truthy
 *   number for it, otherwise 'VARY'.
 */
function normalizeAndroidVersion (androidVersionText) {
  // A missing label cannot be split; report it like any non-numeric label.
  if (!androidVersionText) {
    return 'VARY';
  }

  const number = androidVersionText.split(' ')[0];
  if (parseFloat(number)) {
    return number;
  }

  return 'VARY';
}

/**
 * Build an object keyed 1 through 5 from the raw histogram container.
 *
 * @param {Array|null|undefined} container Raw container; for each key k the
 *   value is read from `container[k][1]`.
 * @returns {Object} Object with keys 1..5; every value is 0 when the
 *   container is falsy.
 */
function buildHistogram (container) {
  const histogram = {};

  for (let stars = 1; stars <= 5; stars++) {
    histogram[stars] = container ? container[stars][1] : 0;
  }

  return histogram;
}

/**
 * Collect up to five comment values from the google play script array
 * @param {array} comments The comments array; the value at index 4 of each
 * entry is used and nil values are dropped
 */
function extractComments (comments) {
  if (!comments) return [];

  debug('comments: %O', comments);

  const values = R.pluck(4, comments);
  const present = R.reject(R.isNil, values);
  return R.take(5, present);
}

module.exports = app;
10 changes: 1 addition & 9 deletions lib/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,7 @@ module.exports.category = {
module.exports.collection = {
TOP_FREE: 'TOP_FREE',
TOP_PAID: 'TOP_PAID',
GROSSING: 'GROSSING',
TOP_FREE_GAMES: 'TOP_FREE_GAMES',
TOP_PAID_GAMES: 'TOP_PAID_GAMES',
TOP_GROSSING_GAMES: 'TOP_GROSSING_GAMES',
TRENDING: 'TRENDING',
NEW_FREE: 'NEW_FREE',
NEW_PAID: 'NEW_PAID',
NEW_FREE_GAMES: 'NEW_FREE_GAMES',
NEW_PAID_GAMES: 'NEW_PAID_GAMES'
GROSSING: 'GROSSING'
};

module.exports.sort = {
Expand Down
100 changes: 67 additions & 33 deletions lib/developer.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

const debug = require('debug')('google-play-scraper:developer');
const qs = require('querystring');
const url = require('url');
const scriptData = require('./utils/scriptData');
const { BASE_URL } = require('./constants');
const request = require('./utils/request');
const R = require('ramda');
const processPages = require('./utils/processPages');
const { checkFinished, processFullDetailApps } = require('./utils/processPages');

function buildUrl (opts) {
const { lang, devId, country } = opts;
Expand Down Expand Up @@ -54,40 +55,73 @@ function developer (opts) {
});
}

const INITIAL_MAPPINGS = {
cluster: ['ds:3', 0, 1, 0, 0, 3, 4, 2],
apps: ['ds:3', 0, 1, 0, 0, 0],
token: ['ds:3', 0, 1, 0, 0, 7, 1]
};

function parseDeveloperApps (developerObject, opts) {
const clusterUrl = opts.hasClusterUrl
? getParsedCluster(developerObject)
: undefined;

if (!clusterUrl) {
return processPages(developerObject, opts, [], INITIAL_MAPPINGS);
async function parseDeveloperApps (html, opts) {
if (R.is(String, html)) {
html = scriptData.parse(html);
}

const clusterUrlToProcess = `${BASE_URL}${clusterUrl}&hl=${opts.lang}&gl=${opts.country}`;

debug('Cluster Request URL: %s', clusterUrlToProcess);

const options = Object.assign({
url: clusterUrlToProcess,
method: 'GET',
followRedirect: true
}, opts.requestOptions);

return request(options, opts.throttle)
.then(scriptData.parse)
.then(clusterObject => processPages(clusterObject, opts, [], INITIAL_MAPPINGS))
.catch(console.error);
}

function getParsedCluster (developerObject) {
const clusterUrl = R.path(INITIAL_MAPPINGS.cluster, developerObject);
return clusterUrl;
const initialMappings = isNaN(opts.devId)
? {
apps: ['ds:3', 0, 1, 0, 22, 0],
token: ['ds:3', 0, 1, 0, 22, 1, 3, 1]
} : {
apps: ['ds:3', 0, 1, 0, 21, 0],
token: ['ds:3', 0, 1, 0, 21, 1, 3, 1]
};

const appsMappings = isNaN(opts.devId)
? {
title: [0, 3],
appId: [0, 0, 0],
url: {
path: [0, 10, 4, 2],
fun: (path) => new url.URL(path, BASE_URL).toString()
},
icon: [0, 1, 3, 2],
developer: [0, 14],
currency: [0, 8, 1, 0, 1],
price: {
path: [0, 8, 1, 0, 0],
fun: (price) => price / 1000000
},
free: {
path: [0, 8, 1, 0, 0],
fun: (price) => price === 0
},
summary: [0, 13, 1],
scoreText: [0, 4, 0],
score: [0, 4, 1]
} : {
title: [3],
appId: [0, 0],
url: {
path: [10, 4, 2],
fun: (path) => new url.URL(path, BASE_URL).toString()
},
icon: [1, 3, 2],
developer: [14],
currency: [8, 1, 0, 1],
price: {
path: [8, 1, 0, 0],
fun: (price) => price / 1000000
},
free: {
path: [8, 1, 0, 0],
fun: (price) => price === 0
},
summary: [13, 1],
scoreText: [4, 0],
score: [4, 1]
};

const processedApps = R.map(scriptData.extractor(appsMappings), R.path(initialMappings.apps, html));
const apps = opts.fullDetail
? await processFullDetailApps(processedApps, opts)
: processedApps;

const token = R.path(initialMappings.token, html);

return checkFinished(opts, apps, token);
}

module.exports = developer;
153 changes: 59 additions & 94 deletions lib/list.js

Large diffs are not rendered by default.

Loading

0 comments on commit e1e5439

Please sign in to comment.