/* * ForgeFlux StarChart - A federated software forge spider * Copyright © 2022 Aravinth Manivannan * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ use std::future::Future; use std::sync::Arc; use std::sync::RwLock; use log::info; use tokio::sync::oneshot::{error::TryRecvError, Receiver}; use url::Url; use db_core::prelude::*; use forge_core::prelude::*; use gitea::Gitea; use crate::ctx::Ctx; use crate::db::BoxDB; use crate::federate::ArcFederate; use crate::ArcCtx; impl Ctx { pub async fn crawl(&self, instance_url: &Url, db: &BoxDB, federate: &ArcFederate) { info!("[crawl][{instance_url}] Init crawling"); let forge: Box = Box::new(Gitea::new(instance_url.clone(), self.client.clone())); if !forge.is_forge().await { unimplemented!("Forge type unimplemented"); } let mut page = 1; let url = forge.get_url(); if !db.forge_exists(url).await.unwrap() { info!("[crawl][{url}] Creating forge"); let msg = CreateForge { url: url.clone(), forge_type: forge.forge_type(), starchart_url: None, }; db.create_forge_instance(&msg).await.unwrap(); } else if !federate.forge_exists(url).await.unwrap() { let forge = db.get_forge(url).await.unwrap(); let msg = CreateForge { url: url.clone(), forge_type: forge.forge_type, starchart_url: None, }; federate.create_forge_instance(&msg).await.unwrap(); } loop { info!("[crawl][{url}] Crawling. page: {page}"); let res = forge .crawl( self.settings.crawler.items_per_api_call, page, self.settings.crawler.wait_before_next_api_call, ) .await; if res.repos.is_empty() { info!("[crawl][{url}] Finished crawling. pages: {}", page - 1); break; } for (username, u) in res.users.iter() { if !db .user_exists(username, Some(forge.get_url())) .await .unwrap() { info!("[crawl][{url}] Creating user: {username}"); let msg = u.as_ref().into(); db.add_user(&msg).await.unwrap(); federate.create_user(&msg).await.unwrap(); } if !federate .user_exists(username, forge.get_url()) .await .unwrap() { let msg = u.as_ref().into(); federate.create_user(&msg).await.unwrap(); } } for r in res.repos.iter() { if !db .repository_exists(&r.name, &r.owner.username, &r.url) .await .unwrap() { info!("[crawl][{url}] Creating repository: {}", r.name); let msg = r.into(); db.create_repository(&msg).await.unwrap(); federate.create_repository(&msg).await.unwrap(); } if !federate .repository_exists(&r.name, &r.owner.username, &r.url) .await .unwrap() { let msg = r.into(); federate.create_repository(&msg).await.unwrap(); } } page += 1; } } } pub struct Crawler { rx: RwLock>>, ctx: ArcCtx, db: BoxDB, federate: ArcFederate, } impl Crawler { pub fn new(rx: Receiver, ctx: ArcCtx, db: BoxDB, federate: ArcFederate) -> Arc { let rx = RwLock::new(Some(rx)); Arc::new(Self { rx, ctx, db, federate, }) } pub fn is_running(&self) -> bool { self.rx.read().unwrap().is_some() } fn shutdown(&self) -> bool { let res = if let Some(rx) = self.rx.write().unwrap().as_mut() { // let rx = self.rx.as_mut().unwrap(); match rx.try_recv() { // The channel is currently empty Ok(x) => { info!("Received signal from tx: {}", x); x } Err(TryRecvError::Empty) => false, Err(TryRecvError::Closed) => { info!("Closed"); true } } } else { true }; if res { let mut rx = self.rx.write().unwrap(); *rx = None; } res } // static is justified since the crawler will be initialized when the program starts // and only shutdown when the program exits pub async fn start(c: Arc) -> impl Future { if c.shutdown() { info!("Stopping crawling job"); return tokio::spawn(tokio::time::sleep(std::time::Duration::new(0, 0))); } let fut = async move { const LIMIT: u32 = 2; let mut page = 0; loop { info!("Running crawling job"); let offset = page * LIMIT; if c.shutdown() { break; } let forges = c.db.get_all_forges(false, offset, LIMIT).await.unwrap(); if forges.is_empty() { c.federate.tar().await.unwrap(); page = 0; tokio::time::sleep(std::time::Duration::new(c.ctx.settings.crawler.ttl, 0)) .await; if c.shutdown() { info!("Stopping crawling job"); break; } continue; } for forge in forges.iter() { if c.shutdown() { info!("Stopping crawling job"); break; } c.ctx .crawl(&Url::parse(&forge.url).unwrap(), &c.db, &c.federate) .await; page += 1; } if c.shutdown() { info!("Stopping crawling job"); break; } } }; tokio::spawn(fut) } } #[cfg(test)] mod tests { use crate::tests::sqlx_sqlite; use url::Url; pub const GITEA_HOST: &str = "http://localhost:8080"; pub const GITEA_USERNAME: &str = "bot"; #[actix_rt::test] async fn crawl_gitea() { let (db, ctx, federate, _tmp_dir) = sqlx_sqlite::get_ctx().await; let url = Url::parse(GITEA_HOST).unwrap(); ctx.crawl(&url, &db, &federate).await; // let hostname = get_hostname(&Url::parse(GITEA_HOST).unwrap()); assert!(db.forge_exists(&url).await.unwrap()); assert!(db.user_exists(GITEA_USERNAME, Some(&url)).await.unwrap()); assert!(db.user_exists(GITEA_USERNAME, None).await.unwrap()); for i in 0..100 { let repo = format!("repository_{i}"); assert!(db .repository_exists(&repo, GITEA_USERNAME, &url) .await .unwrap()) } assert!(db.forge_exists(&url).await.unwrap()); } // #[actix_rt::test] // async fn crawlerd() { // use super::*; //use tokio::sync::oneshot; // // let (db, ctx, federate, _tmp_dir) = sqlx_sqlite::get_ctx().await; // let (kill_crawler, rx) = oneshot::channel(); // let crawler = Crawler::new(rx, ctx.clone(), db, federate); // let fut = tokio::spawn(Crawler::start(crawler.clone())); // assert!(crawler.is_running()); // // tokio::time::sleep(std::time::Duration::new(2, 0)) // .await; // // // kill_crawler.send(true).unwrap(); // tokio::time::sleep(std::time::Duration::new(2, 0)) // .await; // // // fut.await.unwrap().await; // assert!(!crawler.is_running()); // } }