// bigly-caret/site.js
//
// 252 lines
// 6.9 KiB
// JavaScript

import { writeFile, readFile } from "fs/promises";
import { rankCalc } from "./rank.js";
import * as cheerio from 'cheerio';
/**
 * Parses a comma-separated environment variable into a list of entries.
 * An unset or empty variable yields an empty list; blank entries are dropped.
 *
 * @param {string | undefined} value - Raw environment variable value.
 * @returns {string[]} Trimmed, non-empty entries.
 */
function parseListEnv(value) {
  if (value == null) return [];
  return String(value)
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry !== '');
}

// Parsed API responses keyed by request URL, shared by all collectors.
let cache = {};
// Data source: 'darflen', 'scratch', 'url' or 'file' (see siteCollector).
let site = process.env.site || 'darflen';
// Text file read by the 'file' source.
let route = process.env.route || './test.txt';
// Page budget for limited crawls. A value taken from the environment is a
// string; every use site coerces it numerically (comparison / multiplication).
let pageLimit = process.env.pageLimit || Infinity;
// URLs that are dropped when they match exactly.
let blacklist = parseListEnv(process.env.blacklist);
// Substrings; any URL containing one of them is dropped.
let greylist = parseListEnv(process.env.greylist);
// Lazily loaded contents of the `route` file (filled in by textCollector).
let hh;
/**
 * Collects the outbound links of a web page and records them as the pages
 * that `url` is "following".
 *
 * @param {string} url - Page to fetch and scan for `<a href>` links.
 * @param {string} path - Relation being collected; only 'following' is
 *   supported, anything else returns an empty list without fetching.
 * @param {string} file - Unused; kept so the signature matches the callers.
 * @param {boolean} useLimit - When true, stop adding links once
 *   `pageLimit * 40` new links have been collected.
 * @param {Object} data2 - Graph store; `data2[url][path]` is read as the
 *   starting list and overwritten with the result.
 * @returns {Promise<string[]>} De-duplicated absolute URLs, with the page's
 *   own origin appended. Empty when the page cannot be fetched.
 */
async function urlCollector(url, path, file, useLimit, data2) {
  if (path !== 'following') return [];
  let html;
  try {
    const response = await fetch(url);
    html = await response.text();
    console.log(`User ${url} was fetched`);
  } catch (err) {
    console.log(`User ${url} did not fetch`);
    return [];
  }
  const $ = cheerio.load(html);
  const previous = data2[url]?.[path] ?? [];
  // A Set keeps insertion order and de-duplicates without rebuilding the
  // whole list for every link.
  const found = new Set(previous);
  const limit = pageLimit * 40 + previous.length;
  $('a').each((i, link) => {
    if (useLimit && found.size >= limit) return;
    const raw = $(link).attr('href');
    if (!raw) return;
    const href = raw.trim();
    const isRelative = href.startsWith('./') || href.startsWith('../') || href.startsWith('/');
    let resolved;
    try {
      // Resolve against the page URL so './', '../', '//host' and any
      // query string or fragment are handled by the URL parser.
      resolved = (isRelative ? new URL(href, url) : new URL(href)).toString();
    } catch (err) {
      // Not a parseable URL (e.g. a bare relative name); skip this link.
      return;
    }
    if (greylist.some((g) => g !== '' && resolved.includes(g))) return;
    if (blacklist.includes(resolved)) return;
    found.add(resolved);
    console.log(`User ${url} has ${found.size} pages calculated`);
  });
  try {
    // Every page also links to its own site root.
    found.add(new URL(new URL(url).origin).toString());
  } catch (err) {
    // Opaque origins serialize as "null", which is not a URL; nothing to add.
  }
  const result = [...found];
  if (!data2[url]) data2[url] = { following: [], followers: [] };
  data2[url][path] = result;
  // Return a copy so callers cannot mutate the stored list.
  return [...result];
}
/**
 * Collects the words that share a sentence/line with `word` in a text file.
 *
 * The file is read once and kept in the module-level `hh`; later calls reuse
 * it regardless of `file`. The text is lower-cased and split into segments on
 * newlines and periods; segments containing `word` as a space-delimited token
 * are kept (all segments when `word` is empty) and broken into unique tokens.
 *
 * @param {string} word - Word to look up.
 * @param {string} path - Relation name ('followers' or 'following') under
 *   which the result is stored.
 * @param {string} file - Path of the text file to read on first use.
 * @param {Object} [data2={}] - Graph store; `data2[word][path]` is
 *   overwritten with the result.
 * @returns {Promise<string[]>} Unique tokens from the matching segments.
 */
async function textCollector(word, path, file, data2 = {}) {
  if (!hh) hh = await readFile(file, 'utf8');
  hh = hh.toLowerCase();
  const segments = hh
    .split(/[\n.]+/g)
    .filter((segment) => word.length === 0 || ` ${segment} `.includes(` ${word} `));
  const tokens = [...new Set(segments.join(' ').split(/[^a-zA-Z0-9']+/g))];
  if (!data2[word]) data2[word] = { following: [], followers: [] };
  data2[word][path] = tokens;
  return tokens;
}
/**
 * Collects the accounts related to `user` on the given site and stores them
 * in `data2[user][path]`.
 *
 * 'file' and 'url' are delegated to textCollector / urlCollector. 'darflen'
 * and 'scratch' are paged through their HTTP APIs until an empty page, a
 * failed or malformed response, or the page limit is reached. Parsed
 * responses are kept in the module-level `cache`, keyed by request URL.
 *
 * @param {string} user - User name (or word / URL for 'file' / 'url').
 * @param {string} path - Relation to collect: 'followers' or 'following'.
 * @param {string} site - 'darflen', 'scratch', 'file' or 'url'.
 * @param {boolean} useLimit - When true, fetch at most `pageLimit` pages.
 * @param {Object} data2 - Graph store that receives the result.
 * @returns {Promise<string[]>} Collected user names (or words / URLs).
 * @throws {Error} When `site` is not one of the supported values.
 */
async function siteCollector(user, path, site, useLimit, data2) {
  if (site === 'file') {
    return await textCollector(user, path, route, data2);
  }
  if (site === 'url') {
    return await urlCollector(user, path, route, useLimit, data2);
  }
  if (site !== 'darflen' && site !== 'scratch') {
    throw new Error('That site is not supported.');
  }
  const pages = [];
  // Pages are numbered from 1; the limit is checked before fetching so no
  // request is spent on a page that would be discarded.
  for (let page = 1; !(useLimit && page > pageLimit); page++) {
    const endpoint = site === 'darflen'
      ? `https://api.darflen.com/users/${user}/${path}/${page}`
      // Scratch paginates by offset, and the first page is offset 0.
      : `https://api.scratch.mit.edu/users/${user}/${path}/?limit=40&offset=${(page - 1) * 40}`;
    let payload = cache[endpoint];
    if (!payload) {
      try {
        const response = await fetch(endpoint);
        payload = await response.json();
      } catch (err) {
        // Network failure or non-JSON body: keep what was collected so far.
        console.warn(`User ${user} ${path} page ${page} could not be loaded`);
        break;
      }
      cache[endpoint] = payload;
    }
    // Darflen nests the list under the relation name; Scratch returns it bare.
    const entries = site === 'darflen' ? payload?.[path] : payload;
    if (!Array.isArray(entries) || entries.length === 0) break;
    pages.push(entries.map((entry) => (site === 'darflen' ? entry.profile.username : entry.username)));
    console.log(`User ${user} has ${page} pages calculated`);
  }
  const out = pages.flat();
  if (!data2[user]) data2[user] = { following: [], followers: [] };
  data2[user][path] = out;
  return out;
}
/**
 * Entry point: builds a follower/following graph around the seed users and
 * writes the ranking to ./users.json.
 *
 * Environment variables read here:
 *   user    - comma-separated seed users (default 'paradock')
 *   depth   - number of expansion rounds around the seeds (default 1)
 *   maxRate - maximum number of collections in flight at once (default 15)
 *   delay   - milliseconds to wait for outstanding collections (default 60000)
 */
(async function () {
  const penv = (process.env.user || 'paradock').split(',');
  const data = {};
  let users = [];

  // Phase 1: unlimited collection for the seed users.
  for (const seed of penv) {
    users = users.concat(await siteCollector(seed, 'followers', site, false, data));
    users = users.concat(await siteCollector(seed, 'following', site, false, data));
  }
  users = [...new Set(users)];

  // Phase 2: widen the user set by one hop per extra depth level.
  const depth = Number(process.env.depth) || 1;
  for (let level = 1; level < depth; level++) {
    users = [...new Set(users)];
    for (const u of [...users]) {
      users = users.concat(await siteCollector(u, 'followers', site, true, data));
      users = users.concat(await siteCollector(u, 'following', site, true, data));
    }
  }
  // Visit each user once even when the last expansion round added duplicates.
  users = [...new Set(users)];

  // Phase 3: collect both relations for every user, at most `maxRate` at once.
  // Each task removes itself from the set when it settles, so the bookkeeping
  // never depends on array positions.
  const maxRate = Number(process.env.maxRate) || 15;
  const inFlight = new Set();
  const launch = (u, relation) => {
    const task = siteCollector(u, relation, site, true, data)
      .then(() => console.log(`User ${u} ${relation} fully calculated`))
      // A failed collection must not abort the whole run.
      .catch((err) => console.warn(`User ${u} ${relation} failed: ${err}`))
      .finally(() => inFlight.delete(task));
    inFlight.add(task);
  };
  for (const u of users) {
    while (inFlight.size >= maxRate) {
      await Promise.race(inFlight);
    }
    data[u] = { followers: [], following: [] };
    launch(u, 'followers');
    launch(u, 'following');
  }

  // Wait for the stragglers, but never longer than `delay`.
  const delay = Number(process.env.delay) || 1000 * 60;
  let timer;
  const timeout = new Promise((resolve) => {
    timer = setTimeout(resolve, delay);
  });
  await Promise.race([timeout, Promise.all(inFlight)]);
  // Otherwise the pending timer keeps the process alive after the work is done.
  clearTimeout(timer);

  // Make the graph symmetric: every "A follows B" edge is recorded on both
  // ends. Keys are snapshotted because entries are added while iterating.
  for (const uf of Object.keys(data)) {
    const u = data[uf];
    if (!u) continue;
    const { following, followers } = u;
    if (!following || !followers) continue;
    if (site === 'url') {
      try {
        // Each page counts its own site root as a follower.
        const origin = new URL(new URL(uf).origin).toString();
        if (!followers.includes(origin)) followers.push(origin);
      } catch (err) {
        // `uf` is not a URL with a usable origin; leave its followers as is.
      }
    }
    for (const f of followers) {
      if (!data[f]) data[f] = { followers: [] };
      if (!data[f].following) data[f].following = [];
      if (!data[f].following.includes(uf)) data[f].following.push(uf);
    }
    for (const f of following) {
      if (!data[f]) data[f] = { following: [] };
      if (!data[f].followers) data[f].followers = [];
      if (!data[f].followers.includes(uf)) data[f].followers.push(uf);
    }
  }

  // NOTE(review): rankCalc lives in ./rank.js; it is treated here as returning
  // a name -> numeric score map - confirm against that module.
  const ranked = Object.entries(rankCalc(data, 100, penv, site === 'url'))
    .sort((a, b) => a[1] - b[1]);
  const report = {};
  for (const [name, score] of ranked) {
    report[name] = score * 100 + "%";
  }
  await writeFile(`./users.json`, JSON.stringify(report), 'utf8');
})().catch((err) => {
  // Surface failures explicitly instead of relying on the unhandled-rejection default.
  console.error(err);
  process.exitCode = 1;
});