feat: username save on spider::crawl

This commit is contained in:
Aravinth Manivannan 2022-04-30 23:51:38 +05:30
parent cb9b8d6d0d
commit 3d98149244
Signed by: realaravinth
GPG key ID: AD9F0F08E855ED88
2 changed files with 36 additions and 5 deletions

1
Cargo.lock generated
View file

@ -379,6 +379,7 @@ dependencies = [
"serde 1.0.136", "serde 1.0.136",
"serde_json", "serde_json",
"thiserror", "thiserror",
"url",
] ]
[[package]] [[package]]

View file

@ -16,6 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
use std::time::Duration; use std::time::Duration;
use tokio::time; use tokio::time;
use url::Url; use url::Url;
@ -31,8 +32,8 @@ const GITEA_NODEINFO: &str = "/api/v1/nodeinfo";
impl Data { impl Data {
pub async fn crawl(&self, hostname: &str, db: &BoxDB) -> Vec<SearchResults> { pub async fn crawl(&self, hostname: &str, db: &BoxDB) -> Vec<SearchResults> {
let mut page = 1; let mut page = 1;
let mut url = Url::parse(hostname).unwrap(); let mut instance_url = Url::parse(hostname).unwrap();
let hostname = url.host().as_ref().unwrap().to_string(); let hostname = get_hostname(&instance_url);
if !db.forge_exists(&hostname).await.unwrap() { if !db.forge_exists(&hostname).await.unwrap() {
let msg = CreateForge { let msg = CreateForge {
hostname: &hostname, hostname: &hostname,
@ -41,6 +42,7 @@ impl Data {
db.create_forge_isntance(&msg).await.unwrap(); db.create_forge_isntance(&msg).await.unwrap();
} }
let mut url = instance_url.clone();
url.set_path(REPO_SEARCH_PATH); url.set_path(REPO_SEARCH_PATH);
let mut repos = Vec::new(); let mut repos = Vec::new();
loop { loop {
@ -59,11 +61,31 @@ impl Data {
.await .await
.unwrap(); .unwrap();
time::sleep(Duration::new( let sleep_fut = time::sleep(Duration::new(
self.settings.crawler.wait_before_next_api_call, self.settings.crawler.wait_before_next_api_call,
0, 0,
)) ));
.await; let sleep_fut = tokio::spawn(sleep_fut);
for repo in res.data.iter() {
if !db
.user_exists(&repo.owner.username, Some(&hostname))
.await
.unwrap()
{
let mut profile_url = instance_url.clone();
profile_url.set_path(&repo.owner.username);
let msg = AddUser {
hostname: &hostname,
username: &repo.owner.username,
html_link: profile_url.as_str(),
profile_photo: Some(&repo.owner.avatar_url),
};
db.add_user(&msg).await.unwrap();
}
}
sleep_fut.await.unwrap();
if res.data.is_empty() { if res.data.is_empty() {
return repos; return repos;
} }
@ -101,6 +123,9 @@ impl Data {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::tests::sqlx_sqlite; use crate::tests::sqlx_sqlite;
use db_core::prelude::*;
use url::Url;
pub const GITEA_HOST: &str = "http://localhost:8080"; pub const GITEA_HOST: &str = "http://localhost:8080";
@ -115,7 +140,12 @@ mod tests {
let (db, data) = sqlx_sqlite::get_data().await; let (db, data) = sqlx_sqlite::get_data().await;
let res = data.crawl(GITEA_HOST, &db).await; let res = data.crawl(GITEA_HOST, &db).await;
let mut elements = 0; let mut elements = 0;
let username = &res.get(0).unwrap().data.get(0).unwrap().owner.username;
let hostname = get_hostname(&Url::parse(GITEA_HOST).unwrap());
assert!(db.forge_exists(&hostname).await.unwrap());
assert!(db.user_exists(username, Some(&hostname)).await.unwrap());
res.iter().for_each(|r| elements += r.data.len()); res.iter().for_each(|r| elements += r.data.len());
assert_eq!(res.len(), 5); assert_eq!(res.len(), 5);
assert_eq!(elements, 100); assert_eq!(elements, 100);
} }