import { writeFile, readFile } from "fs/promises";
import { rankCalc } from "./rank.js";
import * as cheerio from "cheerio";

// Per-run state shared across crawl rounds:
let cache = {};   // memoized JSON API responses, keyed by request URL
let hh;           // lazily-loaded text file contents for 'file' mode
let percent = 0;  // running progress estimate used in log lines
let noWorry = {}; // pages whose outbound links were fully collected (no limit cut-off)

// 'url' mode: collect outbound links from a page. `data2` maps each URL to
// { following, followers }; only the 'following' direction is fetched here.
// (`file` is accepted for signature parity with the other collectors but unused.)
async function urlCollector(url, path, file, useLimit, data2, settings) {
  let { pageLimit } = settings;
  let blacklist = (settings.blacklist + '').split(',');
  let greyList = (settings.greyList + '').split(',');

  let urls = data2[url] ? (data2[url][path] || []) : [];
  urls = [...urls];
  if (path != 'following') return urls;
  if (noWorry[url]) return [...noWorry[url]];

  let data;
  try {
    data = await fetch(url);
    // Only parse HTML responses; skip JSON, images, etc.
    if (!(data.headers.get('content-type') || '').startsWith('text/html')) return urls;
    data = await data.text();
    console.log(`User ${url} was fetched`);
  } catch (err) {
    console.warn(`User ${url} did not fetch`);
    return urls;
  }

  let body = cheerio.load(data);
  let links = body('a');
  let ll = urls.length;
  let returnedFalse = false;
  links.each(function (i, link) {
    // Stop once the page limit is reached (40 links count as one "page").
    if (useLimit && urls.length >= pageLimit * 40 + ll) {
      returnedFalse = true;
      return false;
    }
    let h = body(link).attr('href');
    if (!h) return true;
    h = h.trim();
    // Resolve hash, query, and path-relative hrefs against the page URL.
    if (h.startsWith('#') || h.startsWith('?') || h.startsWith('./') || h.startsWith('../') || h.startsWith('/')) {
      h = new URL(h, url).toString();
    }
    let h2;
    try {
      h2 = new URL(h);
    } catch (err) {
      // Not an absolute URL (e.g. mailto:, javascript:); leave h2 unset.
    }
    if (!h2) return true;
    // In 'relative' mode, only follow links that stay on the page's host.
    if (settings.isRelative == 'relative' && h2.host != new URL(url).host) return true;
    for (let g of greyList) {
      if (g != '' && h2.toString().includes(g)) return true;
    }
    if (blacklist.indexOf(h2.toString()) != -1) return true;
    urls.push(h2.toString());
    urls = [...new Set(urls)];
    console.log(`User ${url} has ${urls.length} pages calculated`);
    return true;
  });

  // Every page also "follows" its origin and its query-stripped form.
  let h3;
  try {
    h3 = new URL(new URL(url).origin);
  } catch (err) {
    // Unparsable URL; skip the origin edge.
  }
  if (h3) urls.push(h3.toString());
  urls.push(url.split('?')[0]);
  if (!data2[url]) data2[url] = { following: [], followers: [] };
  data2[url][path] = [...new Set(urls)];

  // Only mark this URL as fully collected if the limit never cut us off.
  if (!returnedFalse) {
    noWorry[url] = [...data2[url][path]];
  }

  return data2[url][path];
}
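
// For reference, the href resolution above relies on standard WHATWG URL
// semantics (plain Node behavior, nothing project-specific):
//   new URL('#top', 'https://a.com/p/q?x=1').toString() // 'https://a.com/p/q?x=1#top'
//   new URL('?y=2', 'https://a.com/p/q?x=1').toString() // 'https://a.com/p/q?y=2'
//   new URL('../r', 'https://a.com/p/q').toString()     // 'https://a.com/r'
//   new URL('/s',   'https://a.com/p/q').toString()     // 'https://a.com/s'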

// One crawl round: fetch followers and following for every user with at most
// `fetchRate` collector promises in flight, bailing out after `delay` ms.
// Returns the input users plus every user discovered in `data` so far.
async function rounder(users, data, mode, settings) {
  let p = [];          // in-flight promises; settled slots become the sentinel 'hi'
  let congested = [];  // promises parked here when the pool refuses to drain
  let timeout = false;
  let { delay, fetchRate, depth } = settings;

  // Shared deadline: once this resolves, every wait below gives up.
  let pr = new Promise(resolve => setTimeout(function (...ag) {
    timeout = true;
    resolve(ag);
  }, delay));

  for (let u of users) {
    let it = 0;
    // Wait until the pool has room for this user's two requests.
    while (p.length >= fetchRate) {
      p = p.filter(x => x != 'hi'); // sweep out settled slots
      if (p.length == 0) break;
      await Promise.any([...p, pr]);
      if (timeout) {
        console.warn(`Somehow, this is stuck.`);
        break;
      }
      it++;
      if (it > 10) {
        // Nothing is settling; park the pool and admit new work anyway.
        console.warn(`Promises got congested. Moving to another array...`);
        congested = congested.concat(p);
        p = [];
        break;
      }
    }
    if (timeout) break;
    if (!data[u]) data[u] = { followers: [], following: [] };
    if (noWorry[u]) {
      // Fully collected in an earlier round; just count the progress.
      percent += 50 / depth / users.length;
      console.log(`User ${u} followers was already fully calculated (${percent}% total)`);
      continue;
    }
    // Each IIFE yields a promise that marks its own pool slot settled ('hi').
    p.push(async function (k) {
      await siteCollector(u, 'followers', settings.site, mode, data, settings);
      percent += 50 / depth / users.length;
      console.log(`User ${u} followers is fully calculated (${percent}% total)`);
      p[k] = 'hi';
    }(p.length));

    p.push(async function (k) {
      await siteCollector(u, 'following', settings.site, mode, data, settings);
      percent += 50 / depth / users.length;
      console.log(`User ${u} following is fully calculated (${percent}% total)`);
      p[k] = 'hi';
    }(p.length));
  }

  if (!timeout) {
    p = p.concat(congested);
    await Promise.all(p);
  } else {
    console.warn('Took too long...');
  }

  // Merge every user seen in `data`, deduplicating in batches so the set
  // rebuild doesn't run on every push.
  let endn = [...users];
  let oldLength = endn.length;
  for (let h in data) {
    endn.push(h);
    if (endn.length > oldLength * 1.25) {
      endn = [...new Set(endn)];
    }
  }
  //let fcg = ((a, b) => ((data[a] ? -data[a].followers.length : 0) - (data[b] ? -data[b].followers.length : 0)));
  //endn = endn.sort(fcg);

  return endn;
}
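
// The throttle in rounder() (sentinel 'hi' slots swept before each admit) can
// be restated as a conventional bounded-concurrency helper. A minimal sketch,
// not used by this module, with Promise.race standing in for the sentinel
// sweep:
async function runLimited(tasks, limit) {
  const pool = new Set(); // in-flight promises
  for (const task of tasks) {
    // Each promise removes itself from the pool when it settles.
    const pr = task().finally(() => pool.delete(pr));
    pool.add(pr);
    // At capacity: wait until at least one slot frees (rejections propagate).
    if (pool.size >= limit) await Promise.race(pool);
  }
  await Promise.all(pool); // drain the remainder
}
// e.g. await runLimited(urls.map(u => () => fetch(u)), 8);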

// 'file' mode: words are the graph nodes. A word "follows" every word that
// shares a segment (text split on newlines/periods) with it.
async function textCollector(word, path, file, data2) {
  if (!hh) hh = await readFile(file, 'utf8');
  hh = hh.toLowerCase();

  // Keep segments containing the word, then collect their unique words.
  let words = hh.split(/[\n.]+/g).filter(x => word.length == 0 || ` ${x} `.includes(` ${word} `));
  words = words.join(' ').split(/[^a-zA-Z0-9']+/g);
  words = [...new Set(words)];

  if (!data2[word]) data2[word] = { following: [], followers: [] };
  data2[word][path] = words;
  return words;
}
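
// Illustration with a hypothetical input file (not shipped with the project):
// for "Cats like milk.\nDogs chase cats." the call
// textCollector('cats', 'following', file, data2) keeps both segments (each
// contains " cats " once lowercased and padded) and returns the deduplicated
// word set ['cats', 'like', 'milk', 'dogs', 'chase']: a word "follows" every
// word it shares a segment with.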

// Dispatches to the collector for `site`; for the API-backed sites it pages
// through /followers or /following until an empty page (or the page limit).
async function siteCollector(user, path, site, useLimit, data2, settings) {
  let { route, pageLimit } = settings;
  let users = [];
  let urls = data2[user] ? (data2[user][path] || []) : [];
  let ul = urls.length;
  let i = 1 + (ul || 0); // resume paging after anything already collected
  let out = [];

  if (site == 'file') {
    return await textCollector(user, path, route, data2);
  }

  if (site == 'url') {
    return await urlCollector(user, path, route, useLimit, data2, settings);
  }

  while (true) {
    let p;
    if (site == 'darflen') {
      p = `https://api.darflen.com/users/${user}/${path}/${i}`;
    } else if (site == 'scratch') {
      p = `https://api.scratch.mit.edu/users/${user}/${path}/?limit=40&offset=${(i - 1) * 40}`;
    } else {
      throw new Error('That site is not supported.');
    }

    // Memoize responses so later rounds don't refetch the same page.
    let j1 = cache[p];
    if (!j1) {
      try {
        let h1 = await fetch(p);
        j1 = await h1.json();
      } catch (err) {
        j1 = []; // network or parse failure: treat as an empty page
      }
    }
    cache[p] = j1;

    if (!j1 || ((i - ul - 1) >= pageLimit && useLimit)) break;

    // Page shapes, as inferred from the responses: darflen wraps users as
    // { profile: { username } } under the path key; scratch returns a bare
    // array of { username, ... }.
    let users2;
    try {
      if (site == 'darflen') {
        users2 = j1[path].map(x => x.profile.username);
      } else if (site == 'scratch') {
        users2 = j1.map(x => x.username);
      }
    } catch (err) {
      users2 = [];
    }

    users.push(users2);

    if (users2.length == 0) break; // an empty page means we've paged past the end

    i++;
    console.log(`User ${user} has ${i} pages calculated`);
  }

  out = out.concat(...users);
  if (useLimit) {
    // Trim to the page limit (40 users per page).
    out.length = Math.min(out.length, pageLimit * 40);
  }

  if (!data2[user]) data2[user] = { following: [], followers: [] };
  data2[user][path] = out;
  return out;
}

async function main(settings) {
  let calcedRank = {};
  let { site, discardThreshold, depth, user, matrixIterations, useArchive } = settings;
  user = user.split(',');

  let users = [];
  let data = {};
  // Resume from a previously archived graph for this seed user, if present.
  try {
    let fil = await readFile(`./net_${btoa(user[0])}.json`);
    data = JSON.parse(fil);
    console.log(`Archive found.`);
  } catch (err) {
    data = {};
  }

  users = await rounder(user, data, false, settings);
  users = [...new Set(users)];

  let dat;
  for (let i = 0; i < depth; i++) {
    if (i != 0) {
      // Later rounds recrawl from the current ranking, keeping only the top
      // `discardThreshold` fraction of the known graph.
      let tempSet = dat.map(x => x[0]);
      let kk = Object.keys(data);
      //kk = kk.sort((x, y) => ((new URL(x).host == new URL(penv[0]).host) ? 0 : 1) - ((new URL(y).host == new URL(penv[0]).host) ? 0 : 1));
      console.log(kk);
      let oldLength = kk.length;
      let theData = {};
      for (let a = 0; a < oldLength * discardThreshold && a < oldLength; a++) {
        let key = kk[a];
        theData[key] = data[key];
      }
      users = tempSet.concat(await rounder(tempSet, theData, true, settings));
      users = [...new Set(users)];
    }

    // Repair the graph: make follower/following edges mutually consistent.
    for (let uf of users) {
      let u = data[uf];
      if (!u) {
        u = data[uf] = { following: [], followers: [] };
      }
      let { following, followers } = u;
      if (!following || !followers) continue;

      try {
        if (site == 'url') {
          // Every page implicitly gains a follower edge from its origin.
          let o = new URL(new URL(uf).origin).toString();
          if (followers.indexOf(o) == -1) {
            followers.push(o);
          }
        }
      } catch (err) {
        // Unparsable URL; skip the origin edge.
      }

      for (let f of followers) {
        if (!data[f]) data[f] = { followers: [] };
        if (!data[f].following) data[f].following = [];
        if (data[f].following.indexOf(uf) == -1) {
          data[f].following.push(uf);
        }
      }
      for (let f of following) {
        if (!data[f]) data[f] = { following: [] };
        if (!data[f].followers) data[f].followers = [];
        if (data[f].followers.indexOf(uf) == -1) {
          data[f].followers.push(uf);
        }
      }
    }
    console.log(`Graph is fully repaired`);

    // Intermediate rounds use a cheap 3-iteration rank; the final round uses
    // the full matrixIterations count.
    calcedRank = rankCalc(data, (i == depth - 1) ? matrixIterations : 3, user, site == 'url', settings.isGpu, settings.arrayMax, calcedRank);
    dat = Object.entries(calcedRank);
    dat = dat.sort((a, b) => b[1] - a[1]);
    console.log(`Graph is calculated with ${dat.length} entries`);

    let dat2 = {};
    for (let d of dat) {
      dat2[d[0]] = d[1] * 100 + "%";
    }

    let srz = JSON.stringify(dat2);
    let ff = `./users_${i}_${btoa(user[0])}_${+new Date()}.json`;
    await writeFile(ff, srz, 'utf8');
    console.log(`Temporary file ${ff} is written`);
    if (useArchive == 'use') {
      ff = `./net_${btoa(user[0])}.json`;
      await writeFile(ff, JSON.stringify(data), 'utf8');
      console.log(`Archive file ${ff} is written`);
    }
  }

  console.log(`Graph is complete (${users.length} entries)`);
}

export { main, rankCalc };
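
/*
Example invocation (a sketch: the field values below are illustrative
assumptions inferred from how `settings` is read above, not defaults that
ship with this module; the import path is hypothetical):

  import { main } from "./index.js";

  await main({
    user: "exampleUser",     // comma-separated seed user list
    site: "scratch",         // 'scratch' | 'darflen' | 'url' | 'file'
    route: "",               // input file path ('file' mode)
    depth: 2,                // number of crawl/rank rounds
    pageLimit: 5,            // max pages (40 entries each) per user when limited
    fetchRate: 8,            // max in-flight collector promises
    delay: 60000,            // ms before a round is declared stuck
    discardThreshold: 0.5,   // fraction of the graph kept between rounds
    matrixIterations: 50,    // rank iterations on the final round
    useArchive: "use",       // 'use' persists ./net_*.json between runs
    blacklist: "",           // comma-separated exact URLs to skip ('url' mode)
    greyList: "",            // comma-separated URL substrings to skip ('url' mode)
    isRelative: "relative",  // 'url' mode: stay on the seed page's host
    isGpu: false,            // forwarded to rankCalc
    arrayMax: 1e6,           // forwarded to rankCalc
  });
*/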