Merge pull request 'feat: crawl forgeflux adapters' (#3) from crawl-forgeflux into master

Reviewed-on: #3
This commit is contained in:
Aravinth Manivannan 2025-02-09 13:07:34 +05:30
commit e486e6d6a8
10 changed files with 390 additions and 9 deletions

View file

@ -3,3 +3,4 @@ export SQLITE_TMP="$(pwd)/db/db-sqlx-sqlite/tmp"
export SQLITE_DATABASE_URL="sqlite://$SQLITE_TMP/admin.db"
export STARCHART__CRAWLER__WAIT_BEFORE_NEXT_API_CALL=0
export FORGEJO_HOST=http://localhost:3000
export FORGEFLUX_HOST=http://localhost:7000

View file

@ -7,9 +7,11 @@ steps:
POSTGRES_DATABASE_URL: postgres://postgres:password@database:5432/postgres
commands:
- pip install requests
- sed -i 's/localhost/forgejo/' scripts/gitea.py
- sed -i 's/localhost\:3000/forgejo/' scripts/gitea.py
- python ./scripts/gitea.py
# - curl -vv http://forgejo/api/v1/repos/bot/repository_58
# - curl -vv http://forge_forgeflux/forgejo/bot/repository_58
# - curl -vv http://forge_forgeflux/forgejo/bot/repository_01
test:
image: rust
@ -18,7 +20,8 @@ steps:
environment:
POSTGRES_DATABASE_URL: postgres://postgres:password@database:5432/postgres
SQLITE_DATABASE_URL: sqlite:///tmp/admin.db
FORGEJO_HOST: http://forgejo:3000
FORGEJO_HOST: http://forgejo
FORGEFLUX_HOST: http://forge_forgeflux
commands:
# - curl -fsSL https://deb.nodesource.com/setup_16.x | bash - &&\
# - apt update && apt-get -y --no-install-recommends install nodejs tar gpg curl wget
@ -32,7 +35,6 @@ steps:
# - make lint
- make test.workspaces
build_docker_img:
image: plugins/docker
when:
@ -42,6 +44,20 @@ steps:
repo: forgeflux/starchart
tags: latest
build_and_publish_docker_img:
image: plugins/docker
when:
event: [push, tag, deployment]
branch: master
settings:
username: forgeflux
password:
from_secret: DOCKER_TOKEN
repo: forgeflux/starchart
tags:
latest
services:
forgejo:
image: codeberg.org/forgejo/forgejo:9
@ -49,9 +65,47 @@ services:
FORGEJO__security__INSTALL_LOCK: true
FORGEJO__federation__ENABLED: true
FORGEJO__server__ROOT_URL: http://forgejo
FORGEJO__server__HTTP_PORT: 3000
FORGEJO__server__HTTP_PORT: 80
database:
image: postgres
environment:
POSTGRES_PASSWORD: password
forge_forgeflux_database:
image: postgres
environment:
POSTGRES_PASSWORD: password
forge_forgeflux:
image: forgeflux/forgeflux:latest
depends_on:
- forgeflux_postgres
environment:
FORGEFLUX_server_PUBLIC_URL_HAS_HTTPS: false
FORGEFLUX_debug: true
FORGEFLUX_source_code: https://git.batsense.net/ForgeFlux/ForgeFlux
FORGEFLUX_allow_registration: true
FORGEFLUX_database_POOL: 2
FORGEFLUX_forges_FORGEJO_url: http://forgejo
FORGEFLUX_forges_FORGEJO_client_id: foo
FORGEFLUX_forges_FORGEJO_client_secret: foo
FORGEFLUX_forges_FORGEJO_user_USERNAME: foo
FORGEFLUX_forges_FORGEJO_user_API_TOKEN: foo
DATABASE_URL: postgres://postgres:password@forgeflux_postgres:5432/postgres
PORT: 80
FORGEFLUX_server_DOMAIN: forge_forgeflux
FORGEFLUX_server_COOKIE_SECRET: 995cde0721b6e41602dd111438cc7c1b2506dc14bad31d2653fb9a4adce1f84e
FORGEFLUX_server_IP: 0.0.0.0
FORGEFLUX_forges_GITHUB_url: https://github.com
FORGEFLUX_forges_GITHUB_api_url: https://api.github.com
FORGEFLUX_forges_GITHUB_client_id: foo
FORGEFLUX_forges_GITHUB_client_secret: foo
FORGEFLUX_forges_GITHUB_user_USERNAME: foo
FORGEFLUX_forges_GITHUB_user_PERSONAL_ACCESS_TOKEN: foo
FORGEFLUX_starchart_enable: true
forgeflux_postgres:
image: postgres:17.2
environment:
POSTGRES_PASSWORD: password # change password

17
Cargo.lock generated
View file

@ -1,6 +1,6 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
version = 4
[[package]]
name = "actix"
@ -1147,6 +1147,20 @@ dependencies = [
"url",
]
[[package]]
name = "forgeflux"
version = "0.1.0"
dependencies = [
"actix-rt",
"async-trait",
"forge-core",
"reqwest",
"serde",
"serde_json",
"tokio",
"url",
]
[[package]]
name = "form_urlencoded"
version = "1.0.1"
@ -2922,6 +2936,7 @@ dependencies = [
"derive_more",
"federate-core",
"forge-core",
"forgeflux",
"gitea",
"lazy_static",
"log",

View file

@ -16,6 +16,7 @@ members = [
"db/db-core",
"db/db-sqlx-sqlite",
"forge/forge-core",
"forge/forgeflux",
"forge/gitea",
"federate/federate-core",
"federate/publiccodeyml"
@ -76,6 +77,9 @@ path = "./db/db-sqlx-sqlite"
[dependencies.gitea]
path = "./forge/gitea"
[dependencies.forgeflux]
path = "./forge/forgeflux"
[dependencies.forge-core]
path = "./forge/forge-core"

View file

@ -309,6 +309,8 @@ impl Clone for Box<dyn SCDatabase> {
pub enum ForgeImplementation {
/// [Gitea](https://gitea.io) software forge
Gitea,
/// [ForgeFlux](https://net.forgeflux.net)
ForgeFlux,
}
impl ForgeImplementation {
@ -316,6 +318,7 @@ impl ForgeImplementation {
pub const fn to_str(&self) -> &'static str {
match self {
ForgeImplementation::Gitea => "gitea",
ForgeImplementation::ForgeFlux => "forgeflux",
}
}
}
@ -326,9 +329,11 @@ impl FromStr for ForgeImplementation {
/// Convert [str] to [ForgeImplementation]
fn from_str(s: &str) -> DBResult<Self> {
const GITEA: &str = ForgeImplementation::Gitea.to_str();
const FORGEFLUX: &str = ForgeImplementation::ForgeFlux.to_str();
let s = s.trim();
match s {
GITEA => Ok(Self::Gitea),
FORGEFLUX => Ok(Self::ForgeFlux),
_ => Err(DBError::UnknownForgeType(s.to_owned())),
}
}

View file

@ -0,0 +1,37 @@
[package]
name = "forgeflux"
version = "0.1.0"
authors = ["realaravinth <realaravinth@batsense.net>"]
description = "ForgeFlux StarChart - Federated forge spider"
documentation = "https://forgeflux.org/"
edition = "2021"
license = "AGPLv3 or later version"
[lib]
name = "forgeflux"
path = "src/lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
async-trait = "0.1.51"
url = { version = "2.2.2", features = ["serde"] }
tokio = { version = "1.17", features = ["time"] }
[dependencies.forge-core]
path = "../forge-core"
[dependencies.reqwest]
features = ["rustls-tls-native-roots", "gzip", "deflate", "brotli", "json"]
version = "0.11.10"
[dependencies.serde]
features = ["derive"]
version = "1"
[dependencies.serde_json]
version = "1"
[dev-dependencies]
actix-rt = "2.7"

187
forge/forgeflux/src/lib.rs Normal file
View file

@ -0,0 +1,187 @@
/*
* ForgeFlux StarChart - A federated software forge spider
* Copyright © 2022 Aravinth Manivannan <realaravinth@batsense.net>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
use std::sync::Arc;
use std::time::Duration;
use reqwest::Client;
use tokio::task::JoinHandle;
use url::Url;
use db_core::ForgeImplementation;
use forge_core::dev::*;
use forge_core::Repository;
pub mod schema;
// Path of the repository search endpoint; `crawl` sets it on a copy of the instance URL.
const REPO_SEARCH_PATH: &str = "/search/repositories";
// NOTE(review): not referenced in the visible part of this file; presumably meant for
// detecting a ForgeFlux instance in `is_forge` -- confirm.
const FORGEFLUX_NODEINFO: &str = "/nodeinfo/2.0";
// NOTE(review): also unreferenced in the visible part of this file; presumably the
// software name expected in the nodeinfo response -- confirm.
const FORGEFLUX_IDENTIFIER: &str = "forgeflux";
/// Handle for talking to a single ForgeFlux instance.
#[derive(Clone)]
pub struct ForgeFlux {
/// Instance URL exactly as passed to `ForgeFlux::new`; `crawl` uses it as the base of the search request.
pub instance_url: Url,
/// HTTP client used for every request issued through this handle.
pub client: Client,
// `instance_url` after being passed through `db_core::clean_url`; this is what `get_url` returns.
url: Url,
}
impl ForgeFlux {
    /// Build a handle for the instance at `instance_url`, issuing requests through `client`.
    ///
    /// # Panics
    ///
    /// Panics when the output of `db_core::clean_url` cannot be parsed back into a [`Url`].
    pub fn new(instance_url: Url, client: Client) -> Self {
        let cleaned = db_core::clean_url(&instance_url);
        let url = Url::parse(&cleaned).unwrap();
        Self {
            url,
            client,
            instance_url,
        }
    }
}
impl PartialEq for ForgeFlux {
    /// Two handles are equal when both the cleaned URL and the original instance URL match.
    /// The HTTP client takes no part in the comparison.
    fn eq(&self, other: &Self) -> bool {
        (&self.url, &self.instance_url) == (&other.url, &other.instance_url)
    }
}
#[async_trait]
impl SCForge for ForgeFlux {
    /// Report whether the instance is a ForgeFlux forge.
    ///
    /// Currently answers `true` unconditionally, without contacting the instance.
    // NOTE(review): presumably this should probe `FORGEFLUX_NODEINFO` and compare the
    // reported software against `FORGEFLUX_IDENTIFIER` -- confirm and implement.
    async fn is_forge(&self) -> bool {
        true
    }

    /// Cleaned URL identifying this forge.
    fn get_url(&self) -> &Url {
        &self.url
    }

    /// Always [`ForgeImplementation::ForgeFlux`].
    fn forge_type(&self) -> ForgeImplementation {
        ForgeImplementation::ForgeFlux
    }

    /// Fetch one page of the repository search endpoint and resolve the owner of each repository.
    ///
    /// * `limit` and `page` are forwarded as the `limit` and `page` query parameters.
    /// * `rate_limit` is the number of seconds to wait between two owner look-ups.
    ///
    /// Every distinct owner URL is fetched once per call; later repositories attributed
    /// to the same owner reuse the cached record.
    ///
    /// # Panics
    ///
    /// Panics when a request fails, when a response body does not deserialize into
    /// the expected schema, or when the rate-limit sleep task fails to join.
    async fn crawl(&self, limit: u64, page: u64, rate_limit: u64) -> CrawlResp {
        let tags = Tags::default();
        let mut users = UserMap::default();
        // Owners already resolved during this call, keyed by the repository's `attributed_to` URL.
        let mut internal_users = UserMap::default();
        let mut repos = Repositories::default();

        let mut url = self.instance_url.clone();
        url.set_path(REPO_SEARCH_PATH);
        url.set_query(Some(&format!("page={page}&limit={limit}")));
        let res: Vec<schema::Repository> = self
            .client
            .get(url)
            .send()
            .await
            .unwrap()
            .json()
            .await
            .unwrap();

        // Timer started after each owner look-up; awaited before the next one.
        let mut sleep_fut: Option<JoinHandle<()>> = None;
        for repo in res {
            let owner_key = repo.attributed_to.to_string();
            let cached = internal_users.get(&owner_key).cloned();
            let user = match cached {
                // Cache hit: no network round-trip needed.
                Some(known) => known,
                // Cache miss: fetch the owner, honouring the rate limit, and remember it.
                None => {
                    if let Some(pending) = sleep_fut.take() {
                        pending.await.unwrap();
                    }

                    let user: schema::User = self
                        .client
                        .get(repo.attributed_to.clone())
                        .send()
                        .await
                        .unwrap()
                        .json()
                        .await
                        .unwrap();

                    sleep_fut = Some(tokio::spawn(tokio::time::sleep(Duration::new(
                        rate_limit, 0,
                    ))));

                    let profile_photo = user.icon.map(|icon| icon.url.to_string());
                    let f_user = Arc::new(User {
                        url: user.id.clone(),
                        username: Arc::new(user.preferred_username),
                        html_link: user.id.to_string(),
                        profile_photo,
                    });

                    // The returned user map is indexed both by username and by the user's URL.
                    users.insert(f_user.username.clone(), f_user.clone());
                    users.insert(Arc::new(f_user.url.to_string()), f_user.clone());
                    internal_users.insert(Arc::new(owner_key), f_user.clone());
                    f_user
                }
            };

            repos.push(Repository {
                url: self.url.clone(),
                website: None,
                name: repo.name,
                owner: user,
                html_link: repo.id.to_string(),
                tags: None,
                description: Some(repo.summary),
            });
        }

        CrawlResp { repos, tags, users }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use url::Url;

    /// Total number of repositories the test walks through.
    pub const NET_REPOSITORIES: u64 = 0;
    /// Page size requested from the instance on every crawl.
    pub const PER_CRAWL: u64 = 10;

    /// Crawls the instance named by the `FORGEFLUX_HOST` environment variable page by page
    /// and expects every page to be full.
    #[actix_rt::test]
    async fn forgeflux_works() {
        let host = std::env::var("FORGEFLUX_HOST").unwrap();
        let forge = ForgeFlux::new(Url::parse(&host).unwrap(), Client::new());
        assert!(forge.is_forge().await);

        // NOTE(review): with NET_REPOSITORIES = 0 this is zero pages, so `crawl`
        // is never actually called by this test.
        let pages = NET_REPOSITORIES / PER_CRAWL;
        for page in 0..pages {
            let resp = forge.crawl(PER_CRAWL, page, 0).await;
            assert_eq!(resp.repos.len() as u64, PER_CRAWL);
        }
    }
}

View file

@ -0,0 +1,69 @@
/*
* ForgeFlux StarChart - A federated software forge spider
* Copyright © 2022 Aravinth Manivannan <realaravinth@batsense.net>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
use serde::{Deserialize, Serialize};
use url::Url;
/// Image descriptor carried in the `icon` field of a [`User`].
///
/// Field names are converted to camelCase on the wire.
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ProfilePhoto {
/// Location of the image.
pub url: Url,
/// Serialized as `mediaType`; presumably the MIME type of the image -- TODO confirm.
pub media_type: String,
}
/// User record as served by a ForgeFlux instance.
///
/// Field names are converted to camelCase on the wire.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct User {
/// Presumably the display name -- TODO confirm.
pub name: String,
/// Serialized as `preferredUsername`.
pub preferred_username: String,
/// URL identifying the user.
pub id: Url,
/// Optional additional URL for the user; may be absent from the payload.
pub url: Option<Url>,
/// Optional profile image; may be absent from the payload.
pub icon: Option<ProfilePhoto>,
}
/// Repository record as served by a ForgeFlux instance.
///
/// Field names are converted to camelCase on the wire. All fields are required.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Repository {
/// URL identifying the repository.
pub id: Url,
/// Serialized as `cloneUri`; presumably the URL to clone the repository from -- TODO confirm.
pub clone_uri: Url,
/// NOTE(review): looks like an ActivityPub-style inbox URL -- confirm.
pub inbox: Url,
/// Name of the repository.
pub name: String,
/// Serialized as `attributedTo`; URL of the owning user.
pub attributed_to: Url,
/// Presumably a free-form description of the repository -- TODO confirm.
pub summary: String,
}
#[cfg(test)]
mod tests {
    use super::*;

    use std::fs;

    /// Checks that recorded ForgeFlux responses deserialize into the structs defined in
    /// this module. Every line of each fixture file is parsed on its own as a JSON list
    /// of [`Repository`] values; a line that does not match the schema fails the test.
    #[test]
    fn schema_doesnt_panic() {
        let fixtures = ["./tests/schema/forgeflux/net.forgeflux.org.json"];
        for path in &fixtures {
            let contents = fs::read_to_string(path).unwrap();
            contents.lines().for_each(|line| {
                let _: Vec<Repository> =
                    serde_json::from_str(line).expect("Forgeflux schema paniced");
            });
        }
    }
}

File diff suppressed because one or more lines are too long

View file

@ -25,6 +25,7 @@ use url::Url;
use db_core::prelude::*;
use forge_core::prelude::*;
use forgeflux::ForgeFlux;
use gitea::Gitea;
use crate::ctx::Ctx;
@ -37,9 +38,16 @@ impl Ctx {
info!("[crawl][{instance_url}] Init crawling");
let forge: Box<dyn SCForge> =
Box::new(Gitea::new(instance_url.clone(), self.client.clone()));
if !forge.is_forge().await {
unimplemented!("Forge type unimplemented");
}
let forge = if forge.is_forge().await {
forge
} else {
let forgeflux = Box::new(ForgeFlux::new(instance_url.clone(), self.client.clone()));
if forgeflux.is_forge().await {
forgeflux
} else {
unimplemented!("Forge type unimplemented");
}
};
let mut page = 1;
let url = forge.get_url();