feat: crawl gitea and get all repository information
This commit is contained in:
parent
35aa155c0e
commit
8a305d6db1
2 changed files with 79 additions and 0 deletions
|
@ -18,6 +18,7 @@
|
|||
pub mod data;
|
||||
pub mod gitea;
|
||||
pub mod settings;
|
||||
pub mod spider;
|
||||
pub mod utils;
|
||||
pub mod verify;
|
||||
|
||||
|
|
78
src/spider.rs
Normal file
78
src/spider.rs
Normal file
|
@ -0,0 +1,78 @@
|
|||
/*
|
||||
* ForgeFlux StarChart - A federated software forge spider
|
||||
* Copyright © 2022 Aravinth Manivannan <realaravinth@batsense.net>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
use std::time::Duration;
|
||||
use tokio::time;
|
||||
use url::Url;
|
||||
|
||||
use crate::data::Data;
|
||||
use crate::gitea::SearchResults;
|
||||
|
||||
// Gitea REST endpoint (relative to the instance root) for paginated repository search.
const REPO_SEARCH_PATH: &str = "/api/v1/repos/search";
|
||||
|
||||
impl Data {
|
||||
pub async fn crawl(&self, hostname: &str) -> Vec<SearchResults> {
|
||||
let mut page = 1;
|
||||
let mut url = Url::parse(hostname).unwrap();
|
||||
url.set_path(REPO_SEARCH_PATH);
|
||||
let mut repos = Vec::new();
|
||||
loop {
|
||||
let mut url = url.clone();
|
||||
url.set_query(Some(&format!(
|
||||
"page={page}&limit={}",
|
||||
self.settings.crawler.items_per_api_call
|
||||
)));
|
||||
let res: SearchResults = self
|
||||
.client
|
||||
.get(url)
|
||||
.send()
|
||||
.await
|
||||
.unwrap()
|
||||
.json()
|
||||
.await
|
||||
.unwrap();
|
||||
// TODO implement save
|
||||
time::sleep(Duration::new(
|
||||
self.settings.crawler.wait_before_next_api_call,
|
||||
0,
|
||||
))
|
||||
.await;
|
||||
if res.data.is_empty() {
|
||||
return repos;
|
||||
}
|
||||
repos.push(res);
|
||||
page += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::settings::Settings;

    /// Address of the local Gitea instance the integration test runs against.
    pub const GITEA_HOST: &str = "http://localhost:8080";

    /// Crawling the test instance must yield 5 pages totalling 100 repositories.
    #[actix_rt::test]
    async fn crawl_gitea() {
        let data = Data::new(Settings::new().unwrap()).await;
        let pages = data.crawl(GITEA_HOST).await;
        let total_repos: usize = pages.iter().map(|page| page.data.len()).sum();
        assert_eq!(pages.len(), 5);
        assert_eq!(total_repos, 100);
    }
}
|
Loading…
Reference in a new issue