bigly-caret/site.js

324 lines
9.4 KiB
JavaScript
Raw Normal View History

2025-01-30 07:59:30 -05:00
2025-01-30 21:00:26 -05:00
import { writeFile, readFile } from "fs/promises";
2025-01-30 07:59:30 -05:00
import { rankCalc } from "./rank.js";
2025-01-31 18:30:18 -05:00
import * as cheerio from 'cheerio';
2025-01-30 07:59:30 -05:00
2025-01-30 08:20:17 -05:00
/**
 * Turn a comma-separated environment value into a list.
 *
 * An unset or empty variable yields an empty list. Coercing `undefined` to a
 * string first would produce the single entry "undefined", which the
 * substring-based greylist would then match against real URLs.
 *
 * @param {string | undefined} raw - Raw value of the environment variable.
 * @returns {string[]} Trimmed, non-empty entries in their original order.
 */
function parseListEnv(raw) {
  if (!raw) return [];
  return raw
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry !== '');
}

// Parsed API responses, keyed by request URL.
let cache = {};
// Which collector to use: 'darflen', 'scratch', 'file' or 'url'.
let site = process.env.site || 'darflen';
// Path handed to the text/url collectors (the text collector reads this file).
let route = process.env.route || './test.txt';
// Maximum number of pages fetched per collector call when limiting is on.
let pageLimit = process.env.pageLimit || Infinity;
// URLs skipped on exact match / URLs skipped when they contain an entry.
let blacklist = parseListEnv(process.env.blacklist);
let greylist = parseListEnv(process.env.greylist);
// Multiplied by the number of known nodes to bound how many are re-crawled.
let threshold = process.env.threshold || 100;
// When true, the url collector only keeps links on the same host as the page.
let rel = (process.env.rel == 'relative');
// Lazily loaded, lower-cased contents of the text file.
let hh;
// Progress counter printed in log messages.
let percent = 0;
// Number of crawl/rank passes.
let d = process.env.depth || 1;
// Users/URLs that need no further fetching.
let noWorry = [];
2025-01-30 21:00:26 -05:00
2025-01-31 22:14:09 -05:00
/**
 * Collect the outgoing links of a web page and record them in `data2`.
 *
 * Only `path == 'following'` triggers a fetch; any other path, or a page that
 * was already scanned to the end, returns a copy of what `data2` holds.
 *
 * @param {string} url - Absolute URL of the page to scan.
 * @param {string} path - Edge kind, 'following' or 'followers'.
 * @param {string} file - Unused here; kept for signature parity with callers.
 * @param {boolean} useLimit - When true, stop after `pageLimit * 40` new links.
 * @param {Object} data2 - Graph store, mutated: `data2[url][path]` is replaced.
 * @returns {Promise<string[]>} Known plus newly found links, same-host links last.
 */
async function urlCollector(url, path, file, useLimit, data2) {
  const known = data2[url] ? (data2[url][path] || []) : [];
  if (path != 'following' || noWorry.indexOf(url) != -1) return [...known];

  let html;
  try {
    const response = await fetch(url);
    html = await response.text();
    console.log(`User ${url} was fetched`);
  } catch (err) {
    console.warn(`User ${url} did not fetch`);
    return [...known];
  }

  // fetch() accepted the URL, so it parses; do it once instead of per link.
  const page = new URL(url);
  const body = cheerio.load(html);
  // A Set keeps insertion order and dedupes without rebuilding an array per link.
  const collected = new Set(known);
  const limit = pageLimit * 40 + known.length;
  let returnedFalse = false;

  body('a').each(function (i, link) {
    if (useLimit && collected.size >= limit) {
      returnedFalse = true;
      return false; // stops the iteration
    }
    let href = body(link).attr('href');
    if (!href) return true;
    href = href.trim();

    let target;
    try {
      // Resolve path-relative hrefs against the page itself. Assigning the
      // href to `pathname` would percent-encode '?' / '#' and would resolve
      // './x' against the site root rather than the page's directory.
      const isRelative = href.startsWith('./') || href.startsWith('../') || href.startsWith('/');
      target = isRelative ? new URL(href, page) : new URL(href);
    } catch (err) {
      return true; // not a parseable absolute URL: skip this link
    }

    if (rel && target.host != page.host) return true;
    const resolved = target.toString();
    for (const g of greylist) {
      if (resolved.includes(g) && g != '') return true;
    }
    if (blacklist.indexOf(resolved) != -1) return true;

    collected.add(resolved);
    console.log(`User ${url} has ${collected.size} pages calculated`);
    return true;
  });

  // Only a scan that reached the last link makes the page final.
  if (!returnedFalse) {
    noWorry.push(url);
  }

  // Every page also links to its own site root. Opaque origins serialise to
  // "null", which is not a URL, so they are skipped instead of crashing.
  if (page.origin !== 'null') {
    collected.add(new URL(page.origin).toString());
  }

  // Rank each entry once so the comparator does not re-parse URLs.
  const rankOf = new Map();
  for (const entry of collected) {
    let sameHost = 0;
    try {
      sameHost = new URL(entry).host == page.host ? 1 : 0;
    } catch (err) {
      sameHost = 0; // unparseable stored entry: treat as foreign host
    }
    rankOf.set(entry, sameHost);
  }

  if (!data2[url]) data2[url] = { following: [], followers: [] };
  // Stable sort: foreign-host links first, same-host links last.
  data2[url][path] = [...collected].sort((x, y) => rankOf.get(x) - rankOf.get(y));

  return data2[url][path];
}
2025-02-01 00:43:00 -05:00
/**
 * Run `siteCollector` for the followers and following of every user, with a
 * bounded number of requests in flight and an overall deadline.
 *
 * Environment: `maxRate` (default 15) is the in-flight threshold above which
 * no new user is started; `delay` (seconds, default 60) is the deadline after
 * which no further users are started.
 *
 * @param {string[]} users - Users (or URLs / words) to collect.
 * @param {Object} data - Graph store; missing users get an empty record and
 *   the collectors fill it in.
 * @param {boolean} mode - Forwarded to `siteCollector` as its `useLimit` flag.
 * @returns {Promise<string[]>} `users` followed by every key of `data`,
 *   without duplicates.
 */
async function rounder(users, data, mode) {
  const maxRate = Number(process.env.maxRate) || 15;
  const delayMs = (process.env.delay * 1000) || (60 * 1000);

  // Every launched request; awaited together at the end.
  const tasks = [];
  // Requests that have not settled yet. Each entry removes itself, so there
  // are no array indices that could go stale when the collection changes.
  const inFlight = new Set();

  let timeout = false;
  let timer;
  const deadline = new Promise((resolve) => {
    timer = setTimeout(() => {
      timeout = true;
      resolve();
    }, delayMs);
  });

  const launch = (user, path) => {
    const task = (async () => {
      await siteCollector(user, path, site, mode, data);
      percent += 50 / d / users.length;
      console.log(`User ${user} ${path} is fully calculated (${percent}% total)`);
    })();
    // `slot` always fulfils, so racing on it never rejects; a failure still
    // surfaces through `Promise.all(tasks)` below.
    const slot = task.then(
      () => inFlight.delete(slot),
      () => inFlight.delete(slot),
    );
    inFlight.add(slot);
    tasks.push(task);
  };

  try {
    for (const u of users) {
      let waits = 0;
      while (inFlight.size >= maxRate) {
        await Promise.race([...inFlight, deadline]);
        if (timeout) {
          console.warn(`Somehow, this is stuck.`);
          break;
        }
        waits++;
        if (waits > 10) {
          // Stop throttling on the current batch; its requests stay in
          // `tasks` and are still awaited at the end.
          console.warn(`Promises got congested. Moving to another array...`);
          inFlight.clear();
          break;
        }
      }
      if (timeout) break;

      if (!data[u]) data[u] = { followers: [], following: [] };
      if (noWorry.indexOf(u) != -1) {
        percent += 50 / d / users.length;
        console.log(`User ${u} followers was already fully calculated (${percent}% total)`);
        continue;
      }
      launch(u, 'followers');
      launch(u, 'following');
    }

    if (!timeout) {
      await Promise.all(tasks);
    } else {
      console.warn('Took too long...');
    }
  } finally {
    // A pending timer would keep the process alive until the delay elapses.
    clearTimeout(timer);
  }

  // Requested users first, then every node the collectors touched.
  return [...new Set([...users, ...Object.keys(data)])];
}
2025-01-30 21:00:26 -05:00
/**
 * Collect the distinct words that share a sentence with `word` in the text
 * file and record them in `data2`.
 *
 * The file is read once and kept in the module-level `hh`. Sentences are
 * split on newlines and full stops; an empty `word` matches every sentence.
 *
 * @param {string} word - Word to look for (matched case-insensitively as a
 *   whole, space-delimited token).
 * @param {string} path - Edge kind under which the result is stored.
 * @param {string} file - Path of the text file to read on first use.
 * @param {Object} [data2] - Graph store, mutated: `data2[word][path]` is set.
 *   The original body used this name without declaring it, which threw a
 *   ReferenceError; the caller already passes it as the fourth argument.
 * @returns {Promise<string[]>} Distinct words from the matching sentences.
 */
async function textCollector(word, path, file, data2 = {}) {
  if (!hh) hh = await readFile(file, 'utf8');
  hh = hh.toLowerCase(); // idempotent after the first call

  // The text is lower-cased, so the needle has to be as well.
  const needle = ` ${word.toLowerCase()} `;
  const sentences = hh
    .split(/[\n.]+/g)
    .filter((sentence) => word.length == 0 || ` ${sentence} `.includes(needle));

  const words = [...new Set(sentences.join(' ').split(/[^a-zA-Z0-9']+/g))];

  if (!data2[word]) data2[word] = { following: [], followers: [] };
  data2[word][path] = words;
  return words;
}
2025-01-30 07:59:30 -05:00
2025-01-31 23:57:07 -05:00
/**
 * Collect one edge list (`path`) for `user` from the configured site and
 * store it in `data2[user][path]`.
 *
 * 'file' and 'url' are delegated to `textCollector` / `urlCollector`;
 * 'darflen' and 'scratch' are paged JSON APIs fetched until an empty page,
 * or until `pageLimit` pages when `useLimit` is set.
 *
 * @param {string} user - User name (or word / URL for the delegated modes).
 * @param {string} path - 'followers' or 'following'.
 * @param {string} site - 'darflen', 'scratch', 'file' or 'url'.
 * @param {boolean} useLimit - Apply `pageLimit` and cap the result at
 *   `pageLimit * 40` entries.
 * @param {Object} data2 - Graph store, mutated.
 * @returns {Promise<string[]>} The collected names.
 * @throws {Error} When `site` is not one of the supported values.
 */
async function siteCollector(user, path, site, useLimit, data2) {
  if (site == 'file') {
    return await textCollector(user, path, route, data2);
  }
  if (site == 'url') {
    return await urlCollector(user, path, route, useLimit, data2);
  }
  if (site != 'darflen' && site != 'scratch') {
    throw new Error('That site is not supported.');
  }

  const existing = data2[user] ? (data2[user][path] || []) : [];
  const ul = existing.length;
  // NOTE(review): the first page requested is offset by the number of entries
  // already stored (not by a page count), and the stored list is replaced
  // rather than merged below. Both are kept as-is; confirm this is intended.
  let i = 1 + ul;
  const pages = [];

  // Check the page budget before requesting, so no page is fetched only to
  // be thrown away.
  while (!(useLimit && (i - ul - 1) >= pageLimit)) {
    const p = site == 'darflen'
      ? `https://api.darflen.com/users/${user}/${path}/${i}`
      : `https://api.scratch.mit.edu/users/${user}/${path}/?limit=40&offset=${(i - 1) * 40}`;

    let j1 = cache[p];
    if (!j1) {
      try {
        const h1 = await fetch(p);
        j1 = await h1.json();
        cache[p] = j1; // only successful responses are remembered
      } catch (err) {
        // A network or parse failure ends this user's paging instead of
        // rejecting the whole crawl.
        console.warn(`User ${user} ${path} page ${i} did not fetch`);
        j1 = [];
      }
    }
    if (!j1) break;

    let users2;
    try {
      users2 = site == 'darflen'
        ? j1[path].map((x) => x.profile.username)
        : j1.map((x) => x.username);
    } catch (err) {
      users2 = []; // unexpected response shape: treat as an empty page
    }
    if (users2.length == 0) break;

    pages.push(users2);
    i++;
    console.log(`User ${user} has ${i} pages calculated`);
  }

  const out = pages.flat();
  if (useLimit) {
    out.length = Math.min(out.length, pageLimit * 40);
  }
  if (!data2[user]) data2[user] = { following: [], followers: [] };
  data2[user][path] = out;
  return out;
}
/**
 * Entry point: crawl the seed users, make the follower/following graph
 * symmetric, rank it, and write one `./users_<pass>.json` file per pass.
 *
 * Seeds come from the comma-separated `user` environment variable. Each pass
 * after the first re-crawls the nodes ranked by the previous pass.
 */
(async function () {
  const penv = (process.env.user || 'paradock').split(',');
  const data = {};

  let users = [...new Set(await rounder(penv, data, false))];
  let dat;

  for (let i = 0; i < d; i++) {
    if (i != 0) {
      const tempSet = dat.map((entry) => entry[0]);
      // Hand the crawler at most `threshold` times the known nodes (and
      // never more than all of them); the records are shared, not copied.
      const keys = Object.keys(data);
      const theData = {};
      for (let a = 0; a < keys.length * threshold && a < keys.length; a++) {
        theData[keys[a]] = data[keys[a]];
      }
      users = [...new Set(tempSet.concat(await rounder(tempSet, theData, true)))];
    }

    for (const uf of users) {
      // Read the record back after creating it: destructuring the stale
      // (undefined) lookup result would throw for users not yet in the graph.
      if (!data[uf]) {
        data[uf] = { following: [], followers: [] };
      }
      const { following, followers } = data[uf];
      if (!following || !followers) continue;

      try {
        if (site == 'url') {
          // In url mode every page counts its own site root as a follower.
          const o = new URL(new URL(uf).origin).toString();
          if (!followers.includes(o)) {
            followers.push(o);
          }
        }
      } catch (err) {
        // `uf` is not a parseable URL: there is no origin to add.
      }

      // Mirror every edge so both endpoints know about it.
      for (const f of followers) {
        if (!data[f]) data[f] = { followers: [] };
        if (!data[f].following) data[f].following = [];
        if (!data[f].following.includes(uf)) {
          data[f].following.push(uf);
        }
      }
      for (const f of following) {
        if (!data[f]) data[f] = { following: [] };
        if (!data[f].followers) data[f].followers = [];
        if (!data[f].followers.includes(uf)) {
          data[f].followers.push(uf);
        }
      }
    }
    console.log(`Graph is fully repaired`);

    // Intermediate passes use 3 iterations; the last one uses the configured count.
    const iterations = (i == d - 1) ? process.env.matrixIterations : 3;
    dat = Object.entries(rankCalc(data, iterations, penv, site == 'url'));
    dat = dat.sort((a, b) => b[1] - a[1]);
    console.log(`Graph is calculated with ${dat.length} entries`);

    const dat2 = {};
    for (const [name, score] of dat) {
      dat2[name] = score * 100 + "%";
    }

    const srz = JSON.stringify(dat2);
    const ff = `./users_${i}.json`;
    await writeFile(ff, srz, 'utf8');
    console.log(`Temporary file ${ff} is written`);
  }
  console.log(`Graph is complete (${Object.keys(users).length} entries)`);
})();