feat: crawl gitea and get all repository information

This commit is contained in:
Aravinth Manivannan 2022-04-06 10:09:49 +05:30
parent 35aa155c0e
commit 8a305d6db1
Signed by: realaravinth
GPG key ID: AD9F0F08E855ED88
2 changed files with 79 additions and 0 deletions

View file

@ -18,6 +18,7 @@
pub mod data;
pub mod gitea;
pub mod settings;
pub mod spider;
pub mod utils;
pub mod verify;

78
src/spider.rs Normal file
View file

@ -0,0 +1,78 @@
/*
* ForgeFlux StarChart - A federated software forge spider
* Copyright © 2022 Aravinth Manivannan <realaravinth@batsense.net>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
use std::time::Duration;
use tokio::time;
use url::Url;
use crate::data::Data;
use crate::gitea::SearchResults;
const REPO_SEARCH_PATH: &str = "/api/v1/repos/search";
impl Data {
pub async fn crawl(&self, hostname: &str) -> Vec<SearchResults> {
let mut page = 1;
let mut url = Url::parse(hostname).unwrap();
url.set_path(REPO_SEARCH_PATH);
let mut repos = Vec::new();
loop {
let mut url = url.clone();
url.set_query(Some(&format!(
"page={page}&limit={}",
self.settings.crawler.items_per_api_call
)));
let res: SearchResults = self
.client
.get(url)
.send()
.await
.unwrap()
.json()
.await
.unwrap();
// TODO implement save
time::sleep(Duration::new(
self.settings.crawler.wait_before_next_api_call,
0,
))
.await;
if res.data.is_empty() {
return repos;
}
repos.push(res);
page += 1;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::settings::Settings;
pub const GITEA_HOST: &str = "http://localhost:8080";
#[actix_rt::test]
async fn crawl_gitea() {
let data = Data::new(Settings::new().unwrap()).await;
let res = data.crawl(GITEA_HOST).await;
let mut elements = 0;
res.iter().for_each(|r| elements += r.data.len());
assert_eq!(res.len(), 5);
assert_eq!(elements, 100);
}
}