118 lines
3.5 KiB
Rust
118 lines
3.5 KiB
Rust
|
/*
|
||
|
* ForgeFlux StarChart - A federated software forge spider
|
||
|
* Copyright © 2022 Aravinth Manivannan <realaravinth@batsense.net>
|
||
|
*
|
||
|
* This program is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU Affero General Public License as
|
||
|
* published by the Free Software Foundation, either version 3 of the
|
||
|
* License, or (at your option) any later version.
|
||
|
*
|
||
|
* This program is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU Affero General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU Affero General Public License
|
||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
use std::time::Duration;
|
||
|
use tokio::time;
|
||
|
use url::Url;
|
||
|
|
||
|
use crate::data::Data;
|
||
|
use crate::gitea::SearchResults;
|
||
|
|
||
|
/// URL path of the repository search endpoint; `Data::crawl` sets it on the host URL.
const REPO_SEARCH_PATH: &str = "/api/v1/repos/search";
|
||
|
/// URL path of the nodeinfo endpoint; `Data::is_gitea` fetches it to read the software name.
const GITEA_NODEINFO: &str = "/api/v1/nodeinfo";
|
||
|
|
||
|
impl Data {
|
||
|
pub async fn crawl(&self, hostname: &str) -> Vec<SearchResults> {
|
||
|
let mut page = 1;
|
||
|
let mut url = Url::parse(hostname).unwrap();
|
||
|
url.set_path(REPO_SEARCH_PATH);
|
||
|
let mut repos = Vec::new();
|
||
|
loop {
|
||
|
let mut url = url.clone();
|
||
|
url.set_query(Some(&format!(
|
||
|
"page={page}&limit={}",
|
||
|
self.settings.crawler.items_per_api_call
|
||
|
)));
|
||
|
let res: SearchResults = self
|
||
|
.client
|
||
|
.get(url)
|
||
|
.send()
|
||
|
.await
|
||
|
.unwrap()
|
||
|
.json()
|
||
|
.await
|
||
|
.unwrap();
|
||
|
// TODO implement save
|
||
|
time::sleep(Duration::new(
|
||
|
self.settings.crawler.wait_before_next_api_call,
|
||
|
0,
|
||
|
))
|
||
|
.await;
|
||
|
if res.data.is_empty() {
|
||
|
return repos;
|
||
|
}
|
||
|
|
||
|
for d in res.data.iter() {
|
||
|
|
||
|
}
|
||
|
|
||
|
repos.push(res);
|
||
|
page += 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// purpose: interact with instance running on provided hostname and verify if the instance is a
|
||
|
/// Gitea instance.
|
||
|
///
|
||
|
/// will get nodeinfo information, which contains an identifier to uniquely identify Gitea
|
||
|
pub async fn is_gitea(&self, hostname: &str) -> bool {
|
||
|
const GITEA_IDENTIFIER: &str = "gitea";
|
||
|
let mut url = Url::parse(hostname).unwrap();
|
||
|
url.set_path(GITEA_NODEINFO);
|
||
|
|
||
|
let res: serde_json::Value = self
|
||
|
.client
|
||
|
.get(url)
|
||
|
.send()
|
||
|
.await
|
||
|
.unwrap()
|
||
|
.json()
|
||
|
.await
|
||
|
.unwrap();
|
||
|
if let serde_json::Value::String(software) = &res["software"]["name"] {
|
||
|
software == GITEA_IDENTIFIER
|
||
|
} else {
|
||
|
false
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::settings::Settings;

    /// Address of the Gitea instance these integration tests talk to.
    pub const GITEA_HOST: &str = "http://localhost:8080";

    /// The instance at `GITEA_HOST` must be identified as Gitea.
    #[actix_rt::test]
    async fn is_gitea_works() {
        let settings = Settings::new().unwrap();
        let data = Data::new(settings).await;
        assert!(data.is_gitea(GITEA_HOST).await);
    }

    /// Crawling `GITEA_HOST` must yield 5 pages holding 100 entries in total.
    #[actix_rt::test]
    async fn crawl_gitea() {
        use crate::tests::sqlx_sqlite;

        let settings = Settings::new().unwrap();
        let data = Data::new(settings).await;
        // Not used by the assertions below; kept so the call is still made.
        let _db = sqlx_sqlite::get_data();

        let pages = data.crawl(GITEA_HOST).await;
        let elements = pages.iter().fold(0, |total, page| total + page.data.len());

        assert_eq!(pages.len(), 5);
        assert_eq!(elements, 100);
    }
}
|