import { writeFile, readFile } from "fs/promises";
import { rankCalc } from "./rank.js";
import * as cheerio from 'cheerio';

/** Parsed API responses used by `siteCollector`, keyed by request URL. */
const cache = {};
/** Lower-cased contents of the text file read by `textCollector` (loaded on first use). */
let hh;
/** Running progress percentage printed by `rounder`. */
let percent = 0;
/** Link lists of pages that `urlCollector` scanned completely (no limit hit), keyed by page URL. */
const noWorry = {};

/**
 * Collects the outgoing links of a web page.
 *
 * Only `path === 'following'` triggers a fetch; for any other path the links
 * already stored in `data2` are returned untouched.
 *
 * @param {string} url - Absolute URL of the page to scan.
 * @param {string} path - `'following'` or `'followers'`.
 * @param {*} file - Unused; kept so the signature matches the other collectors.
 * @param {boolean} useLimit - When true, stop after `pageLimit * 40` new links.
 * @param {object} data2 - Graph store; `data2[url][path]` is updated in place.
 * @param {object} settings - Reads `pageLimit`, `blacklist`, `greyList`, `isRelative`.
 * @returns {Promise<string[]>} De-duplicated list of link URLs.
 */
async function urlCollector(url, path, file, useLimit, data2, settings) {
  const { pageLimit } = settings;
  // `?? ''` keeps an unset list from turning into the literal text "undefined".
  const blacklist = new Set(String(settings.blacklist ?? '').split(','));
  const greyList = String(settings.greyList ?? '')
    .split(',')
    .filter((fragment) => fragment !== '');

  const known = [...(data2[url] ? (data2[url][path] || []) : [])];
  if (path !== 'following') return known;
  if (noWorry[url]) return [...noWorry[url]];

  let html;
  try {
    const response = await fetch(url);
    // A response without a content-type header is treated as "not HTML".
    const contentType = response.headers.get('content-type') ?? '';
    if (!contentType.startsWith('text/html')) return known;
    html = await response.text();
    console.log(`User ${url} was fetched`);
  } catch (err) {
    console.warn(`User ${url} did not fetch`);
    return known;
  }

  const $ = cheerio.load(html);
  const pageUrl = new URL(url);
  // A Set keeps first-seen order and makes de-duplication O(1) per link.
  const seen = new Set(known);
  const cap = pageLimit * 40 + known.length;
  let truncated = false;

  $('a').each((index, anchor) => {
    if (useLimit && seen.size >= cap) {
      truncated = true;
      return false; // returning false stops cheerio's iteration
    }

    const href = $(anchor).attr('href')?.trim();
    if (!href) return true;

    let target;
    try {
      // Pure fragment links point back at the page itself; everything else
      // (absolute, root-relative, ./, ../, ?query, //host, bare relative) is
      // resolved against the page URL.
      target = href.startsWith('#') ? new URL(pageUrl) : new URL(href, pageUrl);
    } catch (err) {
      return true; // unparsable href: skip this link
    }

    if (settings.isRelative === 'relative' && target.host !== pageUrl.host) return true;

    const absolute = target.toString();
    if (greyList.some((fragment) => absolute.includes(fragment))) return true;
    if (blacklist.has(absolute)) return true;

    seen.add(absolute);
    console.log(`User ${url} has ${seen.size} pages calculated`);
    return true;
  });

  // Every page also links to its site root and to itself without the query string.
  try {
    seen.add(new URL(pageUrl.origin).toString());
  } catch (err) {
    // Opaque origins serialise as "null" and cannot be parsed; nothing to add.
  }
  seen.add(url.split('?')[0]);

  if (!data2[url]) data2[url] = { following: [], followers: [] };
  data2[url][path] = [...seen];
  if (!truncated) {
    // The page was scanned completely, so later calls can reuse the result.
    noWorry[url] = [...data2[url][path]];
  }
  return data2[url][path];
}

/**
 * Runs `siteCollector` for the followers and following of every user, keeping
 * roughly `settings.fetchRate` requests in flight and giving up on scheduling
 * new work once `settings.delay` milliseconds have passed.
 *
 * @param {string[]} users - Users (or URLs / words) to collect.
 * @param {object} data - Graph store, filled in place.
 * @param {boolean} mode - Forwarded to `siteCollector` as `useLimit`.
 * @param {object} settings - Reads `delay`, `fetchRate`, `depth`, `site`.
 * @returns {Promise<string[]>} `users` followed by every key of `data`, without duplicates.
 */
async function rounder(users, data, mode, settings) {
  const { delay, fetchRate, depth } = settings;
  // Tasks are tracked by identity: array indexes go stale as soon as the
  // in-flight list is filtered or reset.
  const finished = new Set();
  let inFlight = [];
  let congested = [];
  let timedOut = false;
  let timer;
  const deadline = new Promise((resolve) => {
    timer = setTimeout(() => {
      timedOut = true;
      resolve();
    }, delay);
  });

  const launch = (u, path) => {
    const task = (async () => {
      await siteCollector(u, path, settings.site, mode, data, settings);
      percent += 50 / depth / users.length;
      console.log(`User ${u} ${path} is fully calculated (${percent}% total)`);
      // Runs after the await above, so `task` is already initialised.
      finished.add(task);
    })();
    inFlight.push(task);
  };

  try {
    for (const u of users) {
      let spins = 0;
      while (inFlight.length >= fetchRate) {
        inFlight = inFlight.filter((task) => !finished.has(task));
        if (inFlight.length === 0) break;
        await Promise.any([...inFlight, deadline]);
        if (timedOut) {
          console.warn(`Somehow, this is stuck.`);
          break;
        }
        spins++;
        if (spins > 10) {
          // Park slow tasks so they stop blocking new ones; they are still
          // awaited at the end.
          console.warn(`Promises got congested. Moving to another array...`);
          congested = congested.concat(inFlight);
          inFlight = [];
          break;
        }
      }
      if (timedOut) break;

      if (!data[u]) data[u] = { followers: [], following: [] };
      if (noWorry[u]) {
        percent += 50 / depth / users.length;
        console.log(`User ${u} followers was already fully calculated (${percent}% total)`);
        continue;
      }
      launch(u, 'followers');
      launch(u, 'following');
    }

    if (!timedOut) {
      await Promise.all(inFlight.concat(congested));
    } else {
      console.warn('Took too long...');
    }
  } finally {
    // Without this the pending timer keeps the process alive for `delay` ms.
    clearTimeout(timer);
  }

  return [...new Set([...users, ...Object.keys(data)])];
}

/**
 * Collects the distinct words of every sentence of the text file that contains
 * `word` (all sentences when `word` is empty). The file is read once and cached
 * in lower case.
 *
 * @param {string} word - Word to look for, matched between spaces.
 * @param {string} path - Key of `data2[word]` that receives the result.
 * @param {string} file - Path of the text file.
 * @param {object} data2 - Graph store, updated in place.
 * @returns {Promise<string[]>} Distinct words of the matching sentences.
 */
async function textCollector(word, path, file, data2) {
  if (!hh) hh = await readFile(file, 'utf8');
  hh = hh.toLowerCase();

  const sentences = hh
    .split(/[\n.]+/g)
    .filter((sentence) => word.length === 0 || ` ${sentence} `.includes(` ${word} `));
  const words = [...new Set(sentences.join(' ').split(/[^a-zA-Z0-9']+/g))];

  if (!data2[word]) data2[word] = { following: [], followers: [] };
  data2[word][path] = words;
  return words;
}

/**
 * Collects one relation (`followers` or `following`) of a user from the
 * configured site and stores it in `data2[user][path]`.
 *
 * `'file'` and `'url'` are delegated to `textCollector` / `urlCollector`;
 * `'darflen'` and `'scratch'` are paged through their HTTP APIs.
 *
 * @param {string} user - User name, URL or word, depending on `site`.
 * @param {string} path - `'followers'` or `'following'`.
 * @param {string} site - `'file'`, `'url'`, `'darflen'` or `'scratch'`.
 * @param {boolean} useLimit - When true, stop after `pageLimit` pages / `pageLimit * 40` entries.
 * @param {object} data2 - Graph store, updated in place.
 * @param {object} settings - Reads `route` and `pageLimit`.
 * @returns {Promise<string[]>} Collected names.
 * @throws {string} `'That site is not supported.'` for any other `site`.
 */
async function siteCollector(user, path, site, useLimit, data2, settings) {
  const { route, pageLimit } = settings;
  if (site === 'file') {
    return textCollector(user, path, route, data2);
  }
  if (site === 'url') {
    return urlCollector(user, path, route, useLimit, data2, settings);
  }

  const existing = data2[user] ? (data2[user][path] || []) : [];
  const existingCount = existing.length;
  // NOTE(review): paging starts after `existingCount` *pages* although the
  // value is an entry count, and the stored list is replaced rather than
  // extended below. Kept as-is; confirm the intended resume behaviour.
  let page = 1 + existingCount;
  const pages = [];

  while (true) {
    let endpoint;
    if (site === 'darflen') {
      endpoint = `https://api.darflen.com/users/${user}/${path}/${page}`;
    } else if (site === 'scratch') {
      endpoint = `https://api.scratch.mit.edu/users/${user}/${path}/?limit=40&offset=${(page - 1) * 40}`;
    } else {
      // A plain string is thrown on purpose: callers see the same value as before.
      throw 'That site is not supported.';
    }

    let payload = cache[endpoint];
    if (!payload) {
      const response = await fetch(endpoint);
      try {
        payload = await response.json();
      } catch (err) {
        payload = []; // non-JSON body: treat as an empty page
      }
    }
    cache[endpoint] = payload;

    if (!payload || (useLimit && page - existingCount - 1 >= pageLimit)) break;

    let names;
    try {
      names = site === 'darflen'
        ? payload[path].map((entry) => entry.profile.username)
        : payload.map((entry) => entry.username);
    } catch (err) {
      names = []; // unexpected payload shape (e.g. an error object)
    }
    pages.push(names);
    if (names.length === 0) break;

    page++;
    console.log(`User ${user} has ${page} pages calculated`);
  }

  const out = pages.flat();
  if (useLimit) {
    out.length = Math.min(out.length, pageLimit * 40);
  }
  if (!data2[user]) data2[user] = { following: [], followers: [] };
  data2[user][path] = out;
  return out;
}

/**
 * Crawls the follower graph around the seed users, ranks it with `rankCalc`
 * once per depth level and writes each ranking to `./users_<i>_<seed>_<time>.json`.
 * With `useArchive === 'use'` the raw graph is also saved to `./net_<seed>.json`;
 * an existing archive of that name is loaded at start-up.
 *
 * @param {object} settings - Reads `site`, `discardThreshold`, `depth`, `user`
 *   (comma-separated seeds), `matrixIterations`, `useArchive`, `isGpu`, plus
 *   everything the collectors read.
 * @returns {Promise<void>}
 */
async function main(settings) {
  const { site, discardThreshold, depth, matrixIterations, useArchive } = settings;
  const user = settings.user.split(',');

  let data = {};
  try {
    // Same file name as the archive written below.
    const parsed = JSON.parse(await readFile(`./net_${btoa(user[0])}.json`, 'utf8'));
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      throw new TypeError('Archive is not an object');
    }
    data = parsed;
    console.log(`Archive found.`);
  } catch (err) {
    // Missing or unusable archive: start from an empty graph.
    data = {};
  }

  let users = [...new Set(await rounder(user, data, false, settings))];
  let dat;

  for (let i = 0; i < depth; i++) {
    if (i !== 0) {
      const ranked = dat.map(([name]) => name);
      const keys = Object.keys(data);
      console.log(keys);
      // Only the first `discardThreshold` share of the known nodes is handed
      // to the next crawl round.
      const kept = {};
      for (let a = 0; a < keys.length * discardThreshold && a < keys.length; a++) {
        kept[keys[a]] = data[keys[a]];
      }
      users = [...new Set(ranked.concat(await rounder(ranked, kept, true, settings)))];
    }

    // Repair the graph so that every edge is recorded on both of its ends.
    for (const uf of users) {
      if (!data[uf]) data[uf] = { following: [], followers: [] };
      const { following, followers } = data[uf];
      if (!following || !followers) continue;

      if (site === 'url') {
        try {
          const origin = new URL(new URL(uf).origin).toString();
          if (!followers.includes(origin)) followers.push(origin);
        } catch (err) {
          // `uf` is not a URL with a usable origin; nothing to add.
        }
      }

      for (const f of followers) {
        if (!data[f]) data[f] = { followers: [] };
        if (!data[f].following) data[f].following = [];
        if (!data[f].following.includes(uf)) data[f].following.push(uf);
      }
      for (const f of following) {
        if (!data[f]) data[f] = { following: [] };
        if (!data[f].followers) data[f].followers = [];
        if (!data[f].followers.includes(uf)) data[f].followers.push(uf);
      }
    }
    console.log(`Graph is fully repaired`);

    // Intermediate levels use 3 iterations; only the last one uses the full count.
    const iterations = i === depth - 1 ? matrixIterations : 3;
    const calcedRank = rankCalc(data, iterations, user, site === 'url', settings.isGpu);
    dat = Object.entries(calcedRank).sort((a, b) => b[1] - a[1]);
    console.log(`Graph is calculated with ${dat.length} entries`);

    const percentages = {};
    for (const [name, score] of dat) {
      percentages[name] = `${score * 100}%`;
    }

    let ff = `./users_${i}_${btoa(user[0])}_${Date.now()}.json`;
    await writeFile(ff, JSON.stringify(percentages), 'utf8');
    console.log(`Temporary file ${ff} is written`);

    if (useArchive === 'use') {
      ff = `./net_${btoa(user[0])}.json`;
      await writeFile(ff, JSON.stringify(data), 'utf8');
      console.log(`Temporary file ${ff} is written`);
    }
  }

  console.log(`Graph is complete (${users.length} entries)`);
}

export { main };