import { writeFile, readFile } from "fs/promises";
import { rankCalc } from "./rank.js";
import * as cheerio from 'cheerio';

/** Number of entries one "page" stands for when `pageLimit` is applied (the Scratch request uses `limit=40`). */
const PAGE_SIZE = 40;

/**
 * Reads a numeric environment variable.
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, empty, or not a number.
 * @returns {number}
 */
function envNumber(name, fallback) {
  const raw = process.env[name];
  if (raw === undefined || raw === '') return fallback;
  const value = Number(raw);
  return Number.isNaN(value) ? fallback : value;
}

/**
 * Reads a comma-separated environment variable as a list.
 * An unset variable gives an empty list (not the literal string "undefined"), and empty items are dropped.
 * @param {string} name - Environment variable name.
 * @returns {string[]}
 */
function envList(name) {
  return (process.env[name] ?? '').split(',').filter((item) => item !== '');
}

/** Parsed API responses keyed by request URL, so a page is fetched at most once per run. */
const cache = {};
/** Data source: 'darflen', 'scratch', 'url' or 'file'. */
const site = process.env.site || 'darflen';
/** File read by the 'file' source. */
const route = process.env.route || './test.txt';
/** Maximum number of pages collected per node when a limit is requested. */
const pageLimit = envNumber('pageLimit', Infinity);
/** Exact URLs that are never recorded by the 'url' source. */
const blacklist = envList('blacklist');
/** Substrings that disqualify a URL in the 'url' source. */
const greylist = envList('greylist');
/** Number of ranking rounds run by `main`. */
const depth = envNumber('depth', 1);
/** Maximum number of collector tasks `rounder` keeps in flight. */
const maxRate = envNumber('maxRate', 15);
/** Lower-cased contents of `route`, loaded on first use by `textCollector`. */
let corpus;
/** Progress counter used only in log output. */
let percent = 0;

/**
 * Collects the outgoing links of a web page ('url' source).
 *
 * Only `path === 'following'` triggers a fetch; for any other path a copy of
 * the already-known list is returned. On success the known links, the links
 * found on the page and the page's own origin are stored in `data2[url][path]`.
 *
 * @param {string} url - Page to fetch; also the node key in `data2`.
 * @param {string} path - 'following' or 'followers'.
 * @param {string} file - Unused; kept so all collectors share one call shape.
 * @param {boolean} useLimit - When true, stop after `pageLimit * 40` new links.
 * @param {Object} data2 - Graph being built, keyed by node.
 * @returns {Promise<string[]>} The links recorded for `url`.
 */
async function urlCollector(url, path, file, useLimit, data2) {
  const known = data2[url]?.[path] ?? [];
  if (path !== 'following') return [...known];

  let html;
  try {
    const response = await fetch(url);
    html = await response.text();
    console.log(`User ${url} was fetched`);
  } catch (err) {
    console.warn(`User ${url} did not fetch`);
    return [...known];
  }

  const $ = cheerio.load(html);
  // A Set keeps the list unique without re-deduplicating after every link.
  const found = new Set(known);
  const cap = pageLimit * PAGE_SIZE + found.size;

  $('a').each((index, link) => {
    if (useLimit && found.size >= cap) return false; // returning false stops cheerio's iteration
    const href = $(link).attr('href')?.trim();
    if (!href) return true;

    const isRelative = href.startsWith('./') || href.startsWith('../') || href.startsWith('/');
    let target;
    try {
      // Relative links are resolved against the page URL so that query strings,
      // fragments and protocol-relative ("//host/...") links stay intact.
      target = (isRelative ? new URL(href, url) : new URL(href)).toString();
    } catch (err) {
      return true; // not a parsable URL: skip this link
    }

    if (greylist.some((fragment) => target.includes(fragment))) return true;
    if (blacklist.includes(target)) return true;

    found.add(target);
    console.log(`User ${url} has ${found.size} pages calculated`);
    return true;
  });

  try {
    // Every page also links to its own origin.
    found.add(new URL(new URL(url).origin).toString());
  } catch (err) {
    // The origin can be the unparsable string "null" (e.g. data: URLs); nothing to add then.
  }

  if (!data2[url]) data2[url] = { following: [], followers: [] };
  data2[url][path] = [...found];
  return data2[url][path];
}

/**
 * Runs `siteCollector` for the followers and the following of every given
 * user, keeping at most `maxRate` tasks in flight and giving up on starting
 * new work once `process.env.delay` seconds (default 60) have passed.
 *
 * If the deadline is hit, tasks that are still running are not awaited; they
 * keep running and may update `data` after this function has returned.
 *
 * @param {string[]} users - Nodes to collect.
 * @param {Object} data - Graph being built; updated in place.
 * @param {boolean} mode - Forwarded to the collectors as `useLimit`.
 * @returns {Promise<string[]>} `users` followed by every key of `data`, without duplicates.
 */
async function rounder(users, data, mode) {
  const inFlight = new Set();
  const started = [];
  let timedOut = false;
  let timer;
  const deadline = new Promise((resolve) => {
    timer = setTimeout(() => {
      timedOut = true;
      resolve();
    }, (process.env.delay * 1000) || (60 * 1000));
  });

  const launch = (user, path) => {
    const task = (async () => {
      try {
        await siteCollector(user, path, site, mode, data);
        percent += 50 / depth / users.length;
        console.log(`User ${user} ${path} is fully calculated (${percent}% total)`);
      } finally {
        // Runs only after the await above has yielded, so `task` is already assigned.
        inFlight.delete(task);
      }
    })();
    inFlight.add(task);
    started.push(task);
  };

  try {
    for (const user of users) {
      while (inFlight.size > 0 && inFlight.size >= maxRate) {
        try {
          await Promise.race([...inFlight, deadline]);
        } catch {
          // A failed task still frees its slot; its error is raised by Promise.all below.
        }
        if (timedOut) {
          console.warn(`Somehow, this is stuck.`);
          break;
        }
      }
      if (timedOut) break;

      if (!data[user]) data[user] = { followers: [], following: [] };
      launch(user, 'followers');
      launch(user, 'following');
    }

    if (timedOut) {
      console.warn('Took too long...');
    } else {
      await Promise.all(started);
    }
  } finally {
    // Without this the pending timer keeps the process alive after the work is done.
    clearTimeout(timer);
  }

  return [...new Set([...users, ...Object.keys(data)])];
}

/**
 * Collects the words that share a sentence with `word` ('file' source).
 *
 * The file is read once, lower-cased, and split into sentences on newlines
 * and periods. Sentences containing `word` as a space-delimited token (all
 * sentences when `word` is empty) are joined and split into unique words,
 * which are stored in `data2[word][path]`.
 *
 * @param {string} word - Node whose neighbours are wanted.
 * @param {string} path - 'following' or 'followers'.
 * @param {string} file - Path of the text file to read.
 * @param {Object} data2 - Graph being built, keyed by node.
 * @returns {Promise<string[]>} The unique words found.
 */
async function textCollector(word, path, file, data2) {
  if (corpus === undefined) {
    corpus = (await readFile(file, 'utf8')).toLowerCase();
  }

  const sentences = corpus
    .split(/[\n.]+/g)
    .filter((sentence) => word.length === 0 || ` ${sentence} `.includes(` ${word} `));
  const words = [...new Set(sentences.join(' ').split(/[^a-zA-Z0-9']+/g))];

  if (!data2[word]) data2[word] = { following: [], followers: [] };
  data2[word][path] = words;
  return words;
}

/**
 * Builds the API URL of one result page.
 * @param {string} site - 'darflen' or 'scratch'.
 * @param {string} user - User name.
 * @param {string} path - 'following' or 'followers'.
 * @param {number} page - 1-based page number.
 * @returns {string}
 * @throws {Error} When `site` is neither 'darflen' nor 'scratch'.
 */
function pageUrl(site, user, path, page) {
  if (site === 'darflen') {
    return `https://api.darflen.com/users/${user}/${path}/${page}`;
  }
  if (site === 'scratch') {
    return `https://api.scratch.mit.edu/users/${user}/${path}/?limit=40&offset=${(page - 1) * PAGE_SIZE}`;
  }
  throw new Error('That site is not supported.');
}

/**
 * Collects the followers or the following of one node and stores the result
 * in `data2[user][path]`.
 *
 * The 'file' and 'url' sources are delegated to `textCollector` and
 * `urlCollector`. For 'darflen' and 'scratch' the paged API is walked until a
 * page is empty, unreadable, or the page limit is reached.
 *
 * @param {string} user - Node to collect.
 * @param {string} path - 'following' or 'followers'.
 * @param {string} site - 'darflen', 'scratch', 'url' or 'file'.
 * @param {boolean} useLimit - When true, stop after `pageLimit` pages and keep at most `pageLimit * 40` names.
 * @param {Object} data2 - Graph being built, keyed by node.
 * @returns {Promise<string[]>} The names collected by this call.
 * @throws {Error} When `site` is not supported.
 */
async function siteCollector(user, path, site, useLimit, data2) {
  if (site === 'file') {
    return await textCollector(user, path, route, data2);
  }
  if (site === 'url') {
    return await urlCollector(user, path, route, useLimit, data2);
  }

  const knownCount = (data2[user]?.[path] ?? []).length;
  // NOTE(review): the first page requested is offset by the number of names already
  // stored (not by a page count), and the stored list is replaced below rather than
  // extended. Kept as in the original; confirm this is the intended resume behaviour.
  let page = 1 + knownCount;
  const pages = [];

  while (true) {
    const request = pageUrl(site, user, path, page);

    let payload = cache[request];
    if (!payload) {
      try {
        const response = await fetch(request);
        payload = await response.json();
      } catch (err) {
        // Same best-effort policy as urlCollector: an unreachable or malformed page ends the walk.
        console.warn(`User ${user} page ${page} did not fetch`);
        payload = [];
      }
      cache[request] = payload;
    }

    const pagesDone = page - knownCount - 1;
    if (!payload || (useLimit && pagesDone >= pageLimit)) break;

    let names;
    try {
      names = site === 'darflen'
        ? payload[path].map((entry) => entry.profile.username)
        : payload.map((entry) => entry.username);
    } catch (err) {
      names = []; // unexpected response shape: treat as an empty page
    }

    pages.push(names);
    if (names.length === 0) break;
    page++;
    console.log(`User ${user} has ${page} pages calculated`);
  }

  let out = pages.flat();
  if (useLimit) {
    out = out.slice(0, Math.max(0, pageLimit * PAGE_SIZE));
  }

  if (!data2[user]) data2[user] = { following: [], followers: [] };
  data2[user][path] = out;
  return out;
}

/**
 * Entry point: collects the graph around the users in `process.env.user`
 * (comma-separated, default 'paradock'), then for each of `depth` rounds makes
 * every edge bidirectionally consistent, ranks the nodes with `rankCalc`, and
 * writes the ranking as percentages to `./users_<round>.json`. From the second
 * round on, the nodes ranked in the previous round are collected first.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const seeds = (process.env.user || 'paradock').split(',');
  const data = {};
  let users = [...new Set(await rounder(seeds, data, false))];
  let ranking;

  for (let i = 0; i < depth; i++) {
    if (i !== 0) {
      const ranked = ranking.map(([name]) => name);
      users = [...new Set(ranked.concat(await rounder(ranked, data, true)))];
    }

    for (const name of users) {
      // A node can be missing here when rounder stopped early on its deadline.
      if (!data[name]) data[name] = { following: [], followers: [] };
      const { following, followers } = data[name];
      if (!following || !followers) continue;

      if (site === 'url') {
        try {
          const origin = new URL(new URL(name).origin).toString();
          if (!followers.includes(origin)) followers.push(origin);
        } catch (err) {
          // `name` is not a URL with a usable origin; no origin edge is added.
        }
      }

      // Mirror every edge on the node at its other end.
      for (const follower of followers) {
        if (!data[follower]) data[follower] = { followers: [] };
        if (!data[follower].following) data[follower].following = [];
        if (!data[follower].following.includes(name)) data[follower].following.push(name);
      }
      for (const followed of following) {
        if (!data[followed]) data[followed] = { following: [] };
        if (!data[followed].followers) data[followed].followers = [];
        if (!data[followed].followers.includes(name)) data[followed].followers.push(name);
      }
    }
    console.log(`Graph is fully repaired`);

    const iterations = (i === depth - 1) ? process.env.matrixIterations : 3;
    ranking = Object.entries(rankCalc(data, iterations, seeds, site === 'url'));
    ranking.sort((a, b) => b[1] - a[1]);
    console.log(`Graph is calculated with ${ranking.length} entries`);

    const report = {};
    for (const [name, score] of ranking) {
      report[name] = score * 100 + "%";
    }
    const outFile = `./users_${i}.json`;
    await writeFile(outFile, JSON.stringify(report), 'utf8');
    console.log(`Temporary file ${outFile} is written`);
  }

  console.log(`Graph is complete (${users.length} entries)`);
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});