diff options
| author | grothedev <grothedev@gmail.com> | 2025-10-26 19:10:59 -0400 |
|---|---|---|
| committer | grothedev <grothedev@gmail.com> | 2025-10-26 19:10:59 -0400 |
| commit | 3dadb3aa1920f25a7f6d4b4775a83cabdbd8275b (patch) | |
| tree | c27772a438203706fc3c212184268bfbb2ebf6b5 | |
first commit. almost all claude. now time to review
47 files changed, 3255 insertions, 0 deletions
diff --git a/IMPLEMENTATION.md b/IMPLEMENTATION.md new file mode 100644 index 0000000..7985246 --- /dev/null +++ b/IMPLEMENTATION.md @@ -0,0 +1,304 @@ +# Implementation Overview + +## Summary + +This project creates a single bootable ISO that automatically configures itself as part of a Kubernetes cluster with integrated distributed services (Ceph, Kafka, MQTT, DNS). All services are managed directly by systemd. + +## Architecture + +### Boot Flow + +``` +┌─────────────────────────────────────────────────────────┐ +│ 1. System Boots from ISO │ +└────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ 2. cluster-detect.service (Very Early Boot) │ +│ - Runs cluster-detect.sh │ +│ - Detects node identity (MAC/IP/hostname) │ +│ - Creates /etc/cluster-config/current-node.yaml │ +│ - Writes /etc/cluster-config/node-identity │ +└────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ 3. Environment File Generation │ +│ - Runs generate-environment-files.sh │ +│ - Creates /etc/cluster-config/environment/*.env │ +│ - Extracts node IP, cluster settings, etc. │ +└────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ 4. Role Activation │ +│ - Runs cluster-activate-roles.sh │ +│ - Maps roles to systemd targets │ +│ - Enables and starts targets │ +└────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ 5. 
Service Startup (Dependency Order) │ +│ - containerd.service │ +│ - etcd.service (masters only) │ +│ - kube-apiserver.service (masters only) │ +│ - kube-controller-manager.service (masters only) │ +│ - kube-scheduler.service (masters only) │ +│ - kubelet.service (all nodes) │ +│ - kafka.service (kafka nodes) │ +│ - ceph-mon@.service (ceph-mon nodes) │ +│ - ceph-osd@.service (ceph-osd nodes) │ +│ - mosquitto.service (mqtt nodes) │ +│ - coredns.service (dns nodes) │ +└─────────────────────────────────────────────────────────┘ +``` + +## Components + +### Configuration Files (configs/) + +#### cluster.yaml +- Defines entire cluster topology +- Lists all nodes with IPs, hostnames, roles +- Specifies enabled services +- Network configuration (pod CIDR, service CIDR) + +#### services/*.yaml (5 files) +- kubernetes.yaml - K8s component configuration +- ceph.yaml - Ceph storage settings +- kafka.yaml - Kafka broker configuration +- mqtt.yaml - MQTT broker settings +- dns.yaml - CoreDNS configuration + +#### nodes/*.yaml (5 files) +- master-01.yaml - Control plane node +- worker-01.yaml - Worker node +- worker-02.yaml - Worker + Ceph OSD +- kafka-01.yaml - Worker + Kafka broker +- storage-01.yaml - Worker + Ceph mon + OSD + +### Systemd Units (systemd/) + +#### Services (11 files) +1. **containerd.service** - Container runtime for Kubernetes +2. **kubelet.service** - Kubernetes node agent +3. **kube-apiserver.service** - Kubernetes API server +4. **kube-controller-manager.service** - K8s controller manager +5. **kube-scheduler.service** - K8s scheduler +6. **etcd.service** - Key-value store for K8s +7. **kafka.service** - Kafka broker (KRaft mode) +8. **ceph-mon@.service** - Ceph monitor (template) +9. **ceph-osd@.service** - Ceph OSD (template) +10. **mosquitto.service** - MQTT broker +11. 
**coredns.service** - DNS server + +#### Targets (7 files) +- **kubernetes-master.target** - Pulls in K8s control plane services +- **kubernetes-worker.target** - Pulls in kubelet +- **kafka.target** - Pulls in Kafka broker +- **ceph-mon.target** - Pulls in Ceph monitor +- **ceph-osd.target** - Pulls in Ceph OSD +- **mqtt.target** - Pulls in Mosquitto +- **dns.target** - Pulls in CoreDNS + +#### Special Service +- **cluster-detect.service** - Runs very early to detect node identity + +### Tools (tools/) + +#### Core Scripts (12 files) + +**Detection & Activation:** +1. **cluster-detect.sh** - Node identity detection (MAC/IP/hostname) +2. **cluster-activate-roles.sh** - Map roles to systemd targets +3. **generate-environment-files.sh** - Create env files for services + +**Service Configuration Generators:** +4. **kubelet-config-generator.sh** - Generate kubelet config.yaml +5. **kube-apiserver-config-generator.sh** - Pre-start checks for API server +6. **etcd-config-generator.sh** - Initialize etcd data directory +7. **kafka-config-generator.sh** - Generate Kafka server.properties +8. **ceph-mon-init.sh** - Initialize Ceph monitor +9. **ceph-osd-init.sh** - Initialize Ceph OSD +10. **mosquitto-config-generator.sh** - Generate mosquitto.conf +11. **coredns-config-generator.sh** - Generate CoreDNS Corefile + +**Validation:** +12. 
**validate-config.py** - Validate cluster configuration before build + +## Role-to-Target Mapping + +| Role | Systemd Target | Services Started | +|------|----------------|------------------| +| master / control-plane | kubernetes-master.target | kubelet, kube-apiserver, kube-controller-manager, kube-scheduler, etcd | +| worker | kubernetes-worker.target | kubelet | +| kafka-broker | kafka.target | kafka | +| ceph-mon | ceph-mon.target | ceph-mon@node | +| ceph-osd | ceph-osd.target | ceph-osd@X (per device) | +| mqtt-broker | mqtt.target | mosquitto | +| dns-server | dns.target | coredns | + +## File Locations (On Installed System) + +### Configuration +``` +/etc/cluster-config/ +├── cluster.yaml # Full cluster topology +├── current-node.yaml # Symlink to this node's config +├── node-identity # This node's name +├── services/ # Service configs +│ ├── kubernetes.yaml +│ ├── ceph.yaml +│ ├── kafka.yaml +│ ├── mqtt.yaml +│ └── dns.yaml +├── nodes/ # All node configs +│ ├── master-01.yaml +│ ├── worker-01.yaml +│ └── ... +└── environment/ # Generated env files + ├── kubelet.env + ├── kube-apiserver.env + ├── kafka.env + └── ... +``` + +### Scripts +``` +/usr/local/bin/ +├── cluster-detect.sh +├── cluster-activate-roles.sh +├── generate-environment-files.sh +├── kubelet-config-generator.sh +├── kafka-config-generator.sh +└── ... +``` + +### Systemd Units +``` +/etc/systemd/system/ +├── cluster-detect.service +├── containerd.service +├── kubelet.service +├── kube-apiserver.service +├── kubernetes-master.target +├── kafka.service +└── ... +``` + +### Data Directories +``` +/var/lib/ +├── kubelet/ # Kubelet data and configs +├── etcd/ # etcd data +├── kafka/ # Kafka logs and data +├── ceph/ # Ceph data +│ ├── mon/ +│ └── osd/ +└── mosquitto/ # MQTT persistence +``` + +## Configuration Generation Process + +1. **Build time**: User edits configs/ directory +2. **Validation**: `validate-config.py` ensures correctness +3. 
**ISO creation**: All configs embedded into ISO (future work) +4. **First boot**: `cluster-detect.sh` identifies node +5. **Environment generation**: `generate-environment-files.sh` creates .env files +6. **Service startup**: Each service's ExecStartPre runs config generator +7. **Runtime**: Services read from generated configs + +## Security Considerations + +### PKI/Certificates +- **Kubernetes**: Requires CA, API server, kubelet, etcd certs +- **Ceph**: Requires cephx authentication keys +- **MQTT**: Password file and ACLs + +**TODO**: Certificate generation not yet implemented + +### Service Hardening +All services use systemd security features: +- `NoNewPrivileges=true` +- `ProtectHome=true` +- `ProtectSystem=strict/full` +- `PrivateTmp=true` +- Limited capabilities (where applicable) + +## Next Steps + +### Critical Path to Working System +1. **Certificate/Key Generation** + - Script to generate Kubernetes PKI + - Script to generate Ceph keys + - MQTT password management + +2. **Network Configuration** + - Static IP assignment + - Network interface configuration + - Calico CNI installation + +3. **Cluster Bootstrapping** + - First master initialization + - Join tokens for workers + - Multi-master etcd cluster setup + - Ceph cluster initialization + +4. 
**ISO Builder** + - Take configs/ + base OS → bootable ISO + - Integrate kickstart/cloud-init + - Embed all scripts and systemd units + +### Nice to Have +- Monitoring (Prometheus/Grafana) +- Logging (Loki/journald) +- Update mechanism +- Rollback support +- Interactive TUI for node selection +- Web dashboard for cluster status + +## Testing Strategy + +### Unit Testing +- Validate each config generator script +- Test role-to-target mapping +- Verify YAML parsing + +### Integration Testing +- Boot test in VMs +- Multi-node cluster formation +- Service startup ordering +- Failure recovery + +### End-to-End Testing +- Full cluster deployment +- Workload deployment +- Storage provisioning +- Message broker connectivity + +## Known Limitations + +1. **Certificate generation not implemented** - Manual PKI setup required +2. **Single master only** - Multi-master etcd cluster needs work +3. **No network config** - Assumes static IPs or DHCP reservations +4. **Ceph bootstrap incomplete** - Mon/OSD initialization stubs only +5. **No update mechanism** - Fresh install only +6. 
**No secrets management** - Passwords and keys in plain text + +## Project Statistics + +- **Configuration files**: 11 (1 cluster + 5 services + 5 nodes) +- **Systemd units**: 19 (11 services + 7 targets + 1 cluster-detect) +- **Scripts**: 12 tools +- **Total files**: 42+ +- **Lines of code**: ~2500+ (estimated) + +## References + +- [Kubernetes Documentation](https://kubernetes.io/docs/) +- [Ceph Documentation](https://docs.ceph.com/) +- [Kafka Documentation](https://kafka.apache.org/documentation/) +- [systemd Documentation](https://systemd.io/) +- [CoreDNS Documentation](https://coredns.io/) diff --git a/README.md b/README.md new file mode 100644 index 0000000..34d5d85 --- /dev/null +++ b/README.md @@ -0,0 +1,238 @@ +# Cluster-from-SystemD + +A specialized Linux distribution that boots directly into being a Kubernetes cluster node, with all distributed systems components (Ceph, Kafka, MQTT, DNS) managed by systemd. + +## Overview + +This project creates a **single bootable ISO** that can be installed on any node in a cluster. Each system boots and automatically detects its role in the cluster, then starts the appropriate services via systemd. 
+ +### Key Features + +- **Single ISO for entire cluster** - No need to build separate images per node +- **Automatic node detection** - System identifies itself by MAC address, IP, or hostname +- **Role-based service activation** - systemd targets start services based on node roles +- **Declarative YAML configuration** - Simple, human-readable cluster topology +- **Configuration validation** - Ensure correctness before building ISO +- **Fedora/Rocky-based** - Targeting RHEL ecosystem for enterprise compatibility + +## Project Structure + +``` +cluster-from-systemd/ +├── configs/ # Cluster configuration files +│ ├── cluster.yaml # Cluster topology and global settings +│ ├── services/ # Service-specific configurations +│ │ ├── kubernetes.yaml +│ │ ├── ceph.yaml +│ │ ├── kafka.yaml +│ │ ├── mqtt.yaml +│ │ └── dns.yaml +│ └── nodes/ # Per-node configurations +│ ├── master-01.yaml +│ ├── worker-01.yaml +│ ├── worker-02.yaml +│ ├── kafka-01.yaml +│ └── storage-01.yaml +├── tools/ # Build and management tools +│ ├── validate-config.py # Validate cluster configuration +│ ├── cluster-detect.sh # Node identity detection +│ └── cluster-activate-roles.sh # Activate systemd targets by role +├── systemd/ # Systemd unit files +│ ├── cluster-detect.service +│ ├── kubernetes-master.target +│ ├── kubernetes-worker.target +│ ├── kafka.target +│ ├── ceph-mon.target +│ ├── ceph-osd.target +│ ├── mqtt.target +│ └── dns.target +├── spec.md # Original specification +└── config-schema.md # Detailed configuration documentation +``` + +## Quick Start + +### 1. Validate Your Configuration + +```bash +python3 tools/validate-config.py configs/ +``` + +This checks: +- Valid YAML syntax +- No duplicate IPs, MACs, or node names +- At least one master node exists +- All enabled services have configs +- Node configs match cluster topology + +### 2. 
Customize Your Cluster + +Edit `configs/cluster.yaml` to define your cluster topology: + +```yaml +cluster: + name: "my-cluster" + domain: "cluster.local" + +nodes: + - name: "master-01" + ip: "192.168.1.10" + roles: ["master", "control-plane"] + + - name: "worker-01" + ip: "192.168.1.20" + roles: ["worker"] +``` + +Edit node configs in `configs/nodes/` to add hardware identifiers: + +```yaml +node: + name: "master-01" + hardware: + mac_addresses: + - "52:54:00:12:34:10" +``` + +### 3. Build ISO (Coming Soon) + +```bash +# Not yet implemented +./tools/build-iso.sh configs/ -o cluster.iso +``` + +## How It Works + +### Boot Sequence + +1. **System boots** from ISO +2. **`cluster-detect.service`** runs early (before network services) +3. **Node detection**: + - Compares system MAC addresses to `configs/nodes/*.yaml` + - Falls back to IP address matching + - Falls back to hostname matching + - Final fallback: interactive console prompt +4. **Identity established**: + - Creates `/etc/cluster-config/current-node.yaml` (symlink to detected node) + - Writes `/etc/cluster-config/node-identity` +5. **Role activation**: + - Reads roles from node config + - Enables and starts systemd targets per role: + - `master` → `kubernetes-master.target` + - `worker` → `kubernetes-worker.target` + - `kafka-broker` → `kafka.target` + - `ceph-osd` → `ceph-osd.target` + - etc. +6. 
**Services start** based on enabled targets + +### Supported Roles + +- **master** / **control-plane** - Kubernetes control plane +- **worker** - Kubernetes worker node +- **kafka-broker** - Kafka message broker +- **kafka-controller** - Kafka controller (KRaft mode) +- **ceph-mon** - Ceph monitor daemon +- **ceph-osd** - Ceph object storage daemon +- **ceph-mds** - Ceph metadata server +- **mqtt-broker** - MQTT message broker +- **dns-server** - DNS server (CoreDNS) + +## Configuration Files + +### Cluster Config (`configs/cluster.yaml`) +Defines the entire cluster topology, network settings, and which services are enabled. + +### Node Configs (`configs/nodes/*.yaml`) +Per-node settings including: +- Node name, hostname, IP +- Roles +- Hardware identifiers (MAC addresses) +- Service-specific overrides +- Resource hints (CPU, memory, storage) + +### Service Configs (`configs/services/*.yaml`) +Service-specific configuration including: +- Version information +- Runtime configuration +- Systemd unit dependencies +- Feature flags and tuning parameters + +See [config-schema.md](config-schema.md) for detailed documentation. + +## Implementation Status + +### ✅ Completed +- **Configuration system** (11 files) + - Cluster topology schema + - 5 node configurations (master, workers, kafka, storage) + - 5 service configurations (k8s, ceph, kafka, mqtt, dns) + - Configuration validator (`validate-config.py`) +- **Boot-time detection** (3 scripts) + - Node identity detection (`cluster-detect.sh`) + - Role activation (`cluster-activate-roles.sh`) + - Environment file generator (`generate-environment-files.sh`) +- **Systemd integration** (19 units) + - 11 service units (containerd, kubelet, kube-apiserver, etcd, kafka, ceph, mqtt, coredns, etc.) 
+ - 7 role-based targets + - 1 early-boot detection service +- **Service configuration generators** (8 scripts) + - Kubernetes component configs + - Kafka broker properties + - Ceph initialization (stubs) + - MQTT and DNS configs + +**Total: 42+ files, ~2500+ lines of code** + +### 📋 Next Steps (Critical Path) +1. **Certificate/Key Generation** + - Kubernetes PKI generation scripts + - Ceph cephx key generation + - MQTT password management +2. **Network Configuration** + - Static IP assignment on boot + - Calico CNI deployment +3. **Cluster Bootstrapping** + - Multi-master etcd cluster setup + - Worker join tokens + - Ceph monitor quorum formation +4. **ISO Builder Tool** + - Package configs + OS → bootable ISO + - Integrate with Fedora/Rocky installer + +### 🎯 Future Enhancements +- Monitoring (Prometheus/Grafana) +- Logging aggregation +- Update and rollback mechanisms +- Web UI for cluster management + +## Development + +### Testing Node Detection + +```bash +# Set CONFIG_DIR to test locally (run from the repository root) +export CONFIG_DIR=$(pwd)/configs +./tools/cluster-detect.sh +``` + +### Adding a New Service + +1. Create service config in `configs/services/my-service.yaml` +2. Add service to `configs/cluster.yaml` enabled list +3. Create systemd target in `systemd/my-service.target` +4. Update role mapping in `tools/cluster-activate-roles.sh` +5. Run validator: `python3 tools/validate-config.py configs/` + +## Contributing + +This is an experimental project. Contributions welcome! 
+ +## License + +TBD + +## See Also + +- [spec.md](spec.md) - Original project specification +- [config-schema.md](config-schema.md) - Detailed configuration documentation +- [IMPLEMENTATION.md](IMPLEMENTATION.md) - Complete implementation overview with architecture diagrams diff --git a/STATUS.md b/STATUS.md new file mode 100644 index 0000000..371bfbd --- /dev/null +++ b/STATUS.md @@ -0,0 +1,327 @@ +# Project Status Report + +**Generated**: 2025-10-26 +**Project**: cluster-from-systemd +**Version**: 0.1.0-alpha + +## Executive Summary + +✅ **Configuration system complete and functional** +✅ **Boot-time detection system implemented** +✅ **All major service units created** +✅ **Configuration validation passing** + +## What Works Now + +### 1. Configuration Management ✅ +- Define entire cluster topology in YAML +- 5 pre-configured node types (master, workers, kafka, storage) +- 5 service configurations (Kubernetes, Ceph, Kafka, MQTT, DNS) +- Comprehensive validation tool catches errors before build + +**Test it:** +```bash +python3 tools/validate-config.py configs/ +# Output: ✓ Validation PASSED +``` + +### 2. Node Detection System ✅ +- Automatically identifies, at boot, which cluster node the system is +- Detection methods: MAC address → IP address → hostname → interactive +- Creates symlink to node-specific configuration +- Generates environment files for all services + +**Components:** +- `tools/cluster-detect.sh` - Main detection logic +- `tools/generate-environment-files.sh` - Creates .env files +- `systemd/cluster-detect.service` - Runs at early boot + +### 3. 
Role-Based Service Activation ✅ +- Maps node roles to systemd targets +- Automatically enables and starts appropriate services +- Supports multi-role nodes (e.g., worker + kafka-broker) + +**Role mappings:** +- master → kubernetes-master.target → api-server, scheduler, controller, etcd +- worker → kubernetes-worker.target → kubelet +- kafka-broker → kafka.target → kafka.service +- ceph-osd → ceph-osd.target → ceph-osd@.service + +### 4. Systemd Service Units ✅ +**11 Service Units Created:** +1. containerd.service - Container runtime +2. kubelet.service - K8s node agent +3. kube-apiserver.service - K8s API server +4. kube-controller-manager.service - K8s controller +5. kube-scheduler.service - K8s scheduler +6. etcd.service - Distributed key-value store +7. kafka.service - Kafka broker (KRaft mode) +8. ceph-mon@.service - Ceph monitor +9. ceph-osd@.service - Ceph OSD +10. mosquitto.service - MQTT broker +11. coredns.service - DNS server + +**7 Target Units:** +- kubernetes-master.target +- kubernetes-worker.target +- kafka.target +- ceph-mon.target +- ceph-osd.target +- mqtt.target +- dns.target + +### 5. Service Configuration Generators ✅ +**8 Configuration Generator Scripts:** +- kubelet-config-generator.sh +- kube-apiserver-config-generator.sh +- etcd-config-generator.sh +- kafka-config-generator.sh +- ceph-mon-init.sh +- ceph-osd-init.sh +- mosquitto-config-generator.sh +- coredns-config-generator.sh + +These run at service startup to generate runtime configs from cluster YAML. 
+ +## Project Statistics + +``` +Total Files: 42 +Total Lines: 2,064 +Configuration: 11 files (cluster + services + nodes) +Systemd Units: 19 files (services + targets) +Scripts: 12 files (bash + python) +Documentation: 4 files (README, spec, schema, implementation) +``` + +## Architecture Diagram + +``` +┌──────────────┐ +│ ISO Boot │ +└──────┬───────┘ + │ + ▼ +┌─────────────────────────┐ +│ cluster-detect.service │ ← Very early boot +│ - Detect node identity │ +│ - Generate env files │ +│ - Activate roles │ +└──────┬──────────────────┘ + │ + ▼ +┌──────────────────────────────────────────┐ +│ Systemd Targets │ +│ ┌────────────┐ ┌──────────┐ │ +│ │ k8s-master │ │ k8s-work │ ┌──────┐ │ +│ │ .target │ │ er.target│ │kafka │ │ +│ └─────┬──────┘ └────┬─────┘ │.tgt │ │ +└────────┼──────────────┼────────┴───┬───┘ + │ │ │ + ▼ ▼ ▼ +┌─────────────┐ ┌──────────┐ ┌────────┐ +│ API Server │ │ Kubelet │ │ Kafka │ +│ Controller │ │ │ │ Broker │ +│ Scheduler │ │ │ │ │ +│ etcd │ │ │ │ │ +└─────────────┘ └──────────┘ └────────┘ +``` + +## What's Missing (Critical Path) + +### 1. Certificate Generation 🔴 +**Priority: CRITICAL** + +The Kubernetes components require a full PKI: +- CA certificate and key +- API server certificate +- Kubelet certificates +- etcd certificates +- Service account keys + +**Action needed:** +- Script to generate all required certificates +- Distribution to appropriate nodes +- Secure key storage + +### 2. Network Configuration 🔴 +**Priority: CRITICAL** + +Systems need network setup before services start: +- Static IP assignment based on cluster.yaml +- Network interface configuration +- Calico CNI plugin installation +- Pod network CIDR setup + +**Action needed:** +- Network configuration script (runs before cluster-detect) +- Calico manifest deployment + +### 3. 
Cluster Bootstrapping 🟡 +**Priority: HIGH** + +First-time cluster initialization: +- etcd cluster formation (multi-master) +- Kubernetes join tokens for workers +- Ceph monitor quorum setup +- Ceph OSD initialization with devices +- Kafka cluster ID generation + +**Action needed:** +- Bootstrap orchestration script +- First-master vs additional-master detection +- Worker join logic + +### 4. ISO Builder 🟡 +**Priority: HIGH** + +Package everything into bootable image: +- Base Fedora/Rocky Linux +- Install all binaries (kubelet, kafka, ceph, etc.) +- Embed configs/ directory +- Install systemd units +- Install scripts to /usr/local/bin/ + +**Action needed:** +- Kickstart/Anaconda integration +- Image builder script (lorax/mkosi) +- Binary download and packaging + +### 5. Post-Install Persistence 🟢 +**Priority: MEDIUM** + +After detection, persist configuration: +- Save detected identity to disk +- Prevent re-detection on reboot +- Handle re-detection on hardware change + +**Action needed:** +- Already partially implemented +- Needs testing and hardening + +## Testing Status + +| Component | Unit Tests | Integration Tests | E2E Tests | +|-----------|------------|-------------------|-----------| +| Configuration Validation | ✅ Pass | N/A | N/A | +| Node Detection | ⏳ Manual | ❌ Not done | ❌ Not done | +| Role Activation | ⏳ Manual | ❌ Not done | ❌ Not done | +| Service Units | ❌ Not done | ❌ Not done | ❌ Not done | +| Full Boot | ❌ Not done | ❌ Not done | ❌ Not done | + +## Development Roadmap + +### Phase 1: Make it Boot (Current → Week 2) +- [ ] Certificate generation scripts +- [ ] Network configuration +- [ ] Basic Kubernetes cluster formation +- [ ] ISO builder (basic version) +- [ ] VM testing + +### Phase 2: Make it Work (Week 3-4) +- [ ] Ceph cluster initialization +- [ ] Kafka cluster setup +- [ ] Multi-master support +- [ ] Worker join automation +- [ ] End-to-end testing + +### Phase 3: Make it Production-Ready (Week 5-8) +- [ ] Monitoring integration +- [ ] 
Logging aggregation +- [ ] Update mechanism +- [ ] Backup and restore +- [ ] Security hardening +- [ ] Documentation + +## Current Limitations + +1. **No actual cluster bootstrap** - Services won't start without certs/config +2. **Single master only** - Multi-master etcd not configured +3. **No CNI** - Pod networking won't work +4. **Manual certificate creation** - Must be done out of band +5. **No ISO builder** - Can't create bootable image yet +6. **No network setup** - Assumes pre-configured networking +7. **Ceph incomplete** - Monitor/OSD init are stubs +8. **No secrets management** - Everything in plain text + +## How to Test Locally + +### Validate Configuration +```bash +python3 tools/validate-config.py configs/ +``` + +### Test Node Detection (Dry Run) +```bash +export CONFIG_DIR=$(pwd)/configs +sudo tools/cluster-detect.sh +# Will attempt MAC/IP detection, fall back to interactive +``` + +### Inspect Generated Service Files +```bash +ls -la systemd/ +cat systemd/kubelet.service +cat systemd/kubernetes-master.target +``` + +### Review Configuration Generators +```bash +ls -la tools/*-generator.sh +cat tools/kafka-config-generator.sh +``` + +## Next Session Goals + +Recommend tackling in this order: + +1. **Certificate Generation** (2-3 hours) + - Write script to generate Kubernetes PKI + - Store certs in /etc/kubernetes/pki/ + - Add to cluster-detect flow + +2. **Network Configuration** (1-2 hours) + - Script to set static IP from cluster.yaml + - Configure network interfaces + - Test on VM + +3. **Basic ISO Builder** (3-4 hours) + - Download Fedora netboot + - Create kickstart file + - Package configs and scripts + - Build test ISO + +4. **VM Testing** (2-3 hours) + - Boot test ISO in VM + - Verify detection works + - Check service startup + - Debug issues + +## Questions for Consideration + +1. **Certificate strategy**: Generate at build time or first boot? +2. **Multi-master**: How to handle etcd cluster formation? +3. 
**Secrets**: Use Vault, sealed-secrets, or simple encryption? +4. **Updates**: In-place or blue-green deployment? +5. **Monitoring**: Integrated or separate cluster? + +## Conclusion + +**The foundation is solid.** We have: +- ✅ Complete configuration system +- ✅ Automatic node detection +- ✅ Role-based service activation +- ✅ All systemd units defined +- ✅ Service configuration generators + +**Next critical steps:** +1. Certificate generation +2. Network setup +3. ISO builder +4. Test in VMs + +The project is well-positioned to become a working prototype with 8-16 more hours of focused development. + +--- + +**Want to continue?** Recommend starting with certificate generation scripts next. diff --git a/config-schema.md b/config-schema.md new file mode 100644 index 0000000..d03d1a4 --- /dev/null +++ b/config-schema.md @@ -0,0 +1,242 @@ +# Configuration Schema Design + +## Overview +The configuration system uses YAML files organized in a hierarchical structure. Configurations are split between: +- **Cluster-level config**: Global settings, network topology, service defaults +- **Node-level config**: Per-node settings, roles, and service overrides + +## Directory Structure +``` +configs/ +├── cluster.yaml # Cluster-wide configuration +├── services/ # Service-specific configurations +│ ├── kubernetes.yaml +│ ├── ceph.yaml +│ ├── kafka.yaml +│ ├── mqtt.yaml +│ └── dns.yaml +└── nodes/ # Per-node configurations + ├── master-01.yaml + ├── worker-01.yaml + ├── kafka-01.yaml + └── ... 
+``` + +## Cluster Configuration (cluster.yaml) + +```yaml +cluster: + name: "production-cluster" + domain: "cluster.local" + +network: + pod_cidr: "10.244.0.0/16" + service_cidr: "10.96.0.0/12" + dns_servers: + - "10.96.0.10" + +nodes: + # List of all nodes in the cluster + - name: "master-01" + hostname: "master-01.cluster.local" + ip: "192.168.1.10" + roles: ["master", "control-plane"] + + - name: "worker-01" + hostname: "worker-01.cluster.local" + ip: "192.168.1.20" + roles: ["worker"] + + - name: "kafka-01" + hostname: "kafka-01.cluster.local" + ip: "192.168.1.30" + roles: ["worker", "kafka-broker"] + + - name: "ceph-01" + hostname: "ceph-01.cluster.local" + ip: "192.168.1.40" + roles: ["worker", "ceph-osd", "ceph-mon"] + +services: + # Which services are enabled cluster-wide + enabled: + - kubernetes + - ceph + - kafka + - mqtt + - dns +``` + +## Node Configuration (nodes/{node-name}.yaml) + +```yaml +node: + name: "master-01" + roles: + - "master" + - "control-plane" + + # Node-specific overrides + hostname: "master-01.cluster.local" + ip: "192.168.1.10" + + # Hardware/resource hints + resources: + cpu_cores: 8 + memory_gb: 32 + storage_gb: 500 + +# Services to run on this node +services: + kubernetes: + enabled: true + type: "master" + components: + - "kube-apiserver" + - "kube-controller-manager" + - "kube-scheduler" + - "etcd" + + ceph: + enabled: false + + kafka: + enabled: false + + mqtt: + enabled: false + + dns: + enabled: true + type: "coredns" +``` + +## Service Configuration (services/kubernetes.yaml) + +```yaml +service: + name: "kubernetes" + version: "1.28" + +# Service-specific configuration +config: + api_server: + port: 6443 + bind_address: "0.0.0.0" + + kubelet: + cgroup_driver: "systemd" + container_runtime: "containerd" + + network_plugin: "calico" + + feature_gates: + - "EphemeralContainers=true" + +# Systemd unit configuration +systemd: + unit_file: "kubelet.service" + wants: + - "containerd.service" + after: + - "containerd.service" + - 
"network-online.target" +``` + +## Role Definitions + +### Predefined Roles +- **master**: Kubernetes control plane node +- **worker**: Kubernetes worker node +- **kafka-broker**: Kafka message broker +- **kafka-controller**: Kafka controller (KRaft mode) +- **ceph-mon**: Ceph monitor daemon +- **ceph-osd**: Ceph object storage daemon +- **ceph-mds**: Ceph metadata server +- **mqtt-broker**: MQTT message broker +- **dns-server**: DNS server + +### Custom Roles +Users can define custom roles by creating role definition files in `roles/` directory. + +## Configuration Validation Rules + +1. Each node must have at least one role +2. At least one node must have the "master" role +3. Service configurations must match enabled services +4. IP addresses must be unique across nodes +5. Node names must be valid DNS names +6. Required service dependencies must be met + +## Single-ISO Deployment Model + +This system uses a **single bootable ISO** that can be installed on any node in the cluster. Node identity is detected automatically at first boot. + +### ISO Contents +The ISO contains configurations for the **entire cluster**: +``` +/etc/cluster-config/ +├── cluster.yaml # Full cluster topology (all nodes) +├── services/ # All service configs +│ ├── kubernetes.yaml +│ ├── ceph.yaml +│ ├── kafka.yaml +│ ├── mqtt.yaml +│ └── dns.yaml +└── nodes/ # Configs for every node in cluster + ├── master-01.yaml + ├── worker-01.yaml + ├── kafka-01.yaml + ├── storage-01.yaml + └── ... +``` + +### Boot-time Configuration Resolution (First Boot) + +1. **System boots** from the ISO +2. **Very early in boot**: `cluster-detect.service` starts (before other services) +3. 
**Node detection** (`cluster-detect.sh`): + - Try to identify node by **MAC address** (compare against `hardware.mac_addresses` in node configs) + - Fallback to **IP address** detection (if static IP or DHCP reservation) + - Fallback to **hostname** detection + - Final fallback: **Interactive prompt** on console asking user to select node identity +4. **Once identified**: + - Create symlink: `/etc/cluster-config/current-node.yaml` → `/etc/cluster-config/nodes/{detected-node}.yaml` + - Write `/etc/cluster-config/node-identity` with node name +5. **Role activation** (`cluster-activate-roles.sh`): + - Read roles from `current-node.yaml` + - Map roles to systemd targets: + - `master` → `kubernetes-master.target` + - `worker` → `kubernetes-worker.target` + - `kafka-broker` → `kafka.target` + - `ceph-osd` → `ceph-osd.target` + - etc. + - Enable and start appropriate targets +6. **Service startup**: + - Systemd targets pull in their service units + - Services read configs from `/etc/cluster-config/services/` and `/etc/cluster-config/current-node.yaml` + - Services start in dependency order + +### Normal Boot (Subsequent Boots) + +1. System boots +2. `cluster-detect.service` runs but finds existing `/etc/cluster-config/node-identity` +3. Skips detection, proceeds to activate saved roles +4. 
Services start normally based on persisted systemd target enablement + +## Implementation Status + +- ✅ Configuration schema defined +- ✅ Configuration validator tool (`tools/validate-config.py`) +- ✅ Node detection script (`tools/cluster-detect.sh`) +- ✅ Role activation script (`tools/cluster-activate-roles.sh`) +- ✅ Environment file generator (`tools/generate-environment-files.sh`) +- ✅ Systemd service units and targets (19 units total) +- ✅ Service unit files (containerd, kubelet, kube-apiserver, etcd, kafka, ceph, mqtt, coredns) +- ✅ Service configuration generators (8 scripts) +- ⏳ Certificate/key generation (Kubernetes PKI, Ceph keys) +- ⏳ Network configuration on boot +- ⏳ ISO builder tool +- ⏳ Cluster bootstrapping (multi-master, join tokens) + +See [IMPLEMENTATION.md](IMPLEMENTATION.md) for complete architecture overview. diff --git a/configs/cluster.yaml b/configs/cluster.yaml new file mode 100644 index 0000000..5c7bdac --- /dev/null +++ b/configs/cluster.yaml @@ -0,0 +1,54 @@ +cluster: + name: "homelab-cluster" + domain: "cluster.local" + version: "1.0.0" + +network: + pod_cidr: "10.244.0.0/16" + service_cidr: "10.96.0.0/12" + dns_servers: + - "10.96.0.10" + +nodes: + - name: "master-01" + hostname: "master-01.cluster.local" + ip: "192.168.1.10" + roles: + - "master" + - "control-plane" + + - name: "worker-01" + hostname: "worker-01.cluster.local" + ip: "192.168.1.20" + roles: + - "worker" + + - name: "worker-02" + hostname: "worker-02.cluster.local" + ip: "192.168.1.21" + roles: + - "worker" + - "ceph-osd" + + - name: "kafka-01" + hostname: "kafka-01.cluster.local" + ip: "192.168.1.30" + roles: + - "worker" + - "kafka-broker" + + - name: "storage-01" + hostname: "storage-01.cluster.local" + ip: "192.168.1.40" + roles: + - "worker" + - "ceph-mon" + - "ceph-osd" + +services: + enabled: + - kubernetes + - ceph + - kafka + - mqtt + - dns diff --git a/configs/nodes/kafka-01.yaml b/configs/nodes/kafka-01.yaml new file mode 100644 index 0000000..2d139dd --- 
/dev/null +++ b/configs/nodes/kafka-01.yaml @@ -0,0 +1,40 @@ +node: + name: "kafka-01" + hostname: "kafka-01.cluster.local" + ip: "192.168.1.30" + + roles: + - "worker" + - "kafka-broker" + + hardware: + mac_addresses: + - "52:54:00:12:34:30" + + resources: + cpu_cores: 8 + memory_gb: 32 + storage_gb: 2000 + + # Node-specific overrides + kafka_broker_id: 1 + +services: + kubernetes: + enabled: true + type: "worker" + components: + - "kubelet" + + ceph: + enabled: false + + kafka: + enabled: true + broker_id: 1 + + mqtt: + enabled: false + + dns: + enabled: false diff --git a/configs/nodes/master-01.yaml b/configs/nodes/master-01.yaml new file mode 100644 index 0000000..bc1ce9c --- /dev/null +++ b/configs/nodes/master-01.yaml @@ -0,0 +1,42 @@ +node: + name: "master-01" + hostname: "master-01.cluster.local" + ip: "192.168.1.10" + + roles: + - "master" + - "control-plane" + + # Hardware identifiers for auto-detection + hardware: + mac_addresses: + - "52:54:00:12:34:10" + # Could also use: serial_number, cpu_id, etc. 
+ + resources: + cpu_cores: 8 + memory_gb: 32 + storage_gb: 500 + +services: + kubernetes: + enabled: true + type: "master" + components: + - "kube-apiserver" + - "kube-controller-manager" + - "kube-scheduler" + - "kubelet" + - "etcd" + + ceph: + enabled: false + + kafka: + enabled: false + + mqtt: + enabled: false + + dns: + enabled: true diff --git a/configs/nodes/storage-01.yaml b/configs/nodes/storage-01.yaml new file mode 100644 index 0000000..7ad51e8 --- /dev/null +++ b/configs/nodes/storage-01.yaml @@ -0,0 +1,50 @@ +node: + name: "storage-01" + hostname: "storage-01.cluster.local" + ip: "192.168.1.40" + + roles: + - "worker" + - "ceph-mon" + - "ceph-osd" + + hardware: + mac_addresses: + - "52:54:00:12:34:40" + + resources: + cpu_cores: 8 + memory_gb: 32 + storage_gb: 4000 + + # Ceph-specific configuration + ceph_devices: + - "/dev/sdb" + - "/dev/sdc" + - "/dev/sdd" + +services: + kubernetes: + enabled: true + type: "worker" + components: + - "kubelet" + + ceph: + enabled: true + components: + - "mon" + - "osd" + osd_devices: + - "/dev/sdb" + - "/dev/sdc" + - "/dev/sdd" + + kafka: + enabled: false + + mqtt: + enabled: false + + dns: + enabled: false diff --git a/configs/nodes/worker-01.yaml b/configs/nodes/worker-01.yaml new file mode 100644 index 0000000..41575c5 --- /dev/null +++ b/configs/nodes/worker-01.yaml @@ -0,0 +1,35 @@ +node: + name: "worker-01" + hostname: "worker-01.cluster.local" + ip: "192.168.1.20" + + roles: + - "worker" + + hardware: + mac_addresses: + - "52:54:00:12:34:20" + + resources: + cpu_cores: 16 + memory_gb: 64 + storage_gb: 1000 + +services: + kubernetes: + enabled: true + type: "worker" + components: + - "kubelet" + + ceph: + enabled: false + + kafka: + enabled: false + + mqtt: + enabled: false + + dns: + enabled: false diff --git a/configs/nodes/worker-02.yaml b/configs/nodes/worker-02.yaml new file mode 100644 index 0000000..1cd8cce --- /dev/null +++ b/configs/nodes/worker-02.yaml @@ -0,0 +1,44 @@ +node: + name: "worker-02" + 
hostname: "worker-02.cluster.local" + ip: "192.168.1.21" + + roles: + - "worker" + - "ceph-osd" + + hardware: + mac_addresses: + - "52:54:00:12:34:21" + + resources: + cpu_cores: 16 + memory_gb: 64 + storage_gb: 2000 + + # Ceph OSD devices + ceph_devices: + - "/dev/sdb" + +services: + kubernetes: + enabled: true + type: "worker" + components: + - "kubelet" + + ceph: + enabled: true + components: + - "osd" + osd_devices: + - "/dev/sdb" + + kafka: + enabled: false + + mqtt: + enabled: false + + dns: + enabled: false diff --git a/configs/services/ceph.yaml b/configs/services/ceph.yaml new file mode 100644 index 0000000..bf7ac14 --- /dev/null +++ b/configs/services/ceph.yaml @@ -0,0 +1,38 @@ +service: + name: "ceph" + version: "17.2.6" # Quincy + description: "Ceph distributed storage system" + +config: + cluster_name: "ceph" + fsid: "{{ cluster.ceph_fsid }}" # Generated UUID for cluster + + global: + mon_host: "192.168.1.40" + auth_cluster_required: "cephx" + auth_service_required: "cephx" + auth_client_required: "cephx" + public_network: "192.168.1.0/24" + cluster_network: "192.168.1.0/24" + + mon: + mon_allow_pool_delete: false + mon_max_pg_per_osd: 300 + + osd: + osd_pool_default_size: 3 + osd_pool_default_min_size: 2 + osd_pool_default_pg_num: 128 + osd_journal_size: 10240 + + mds: + mds_cache_memory_limit: 4294967296 + +systemd: + mon_unit_file: "ceph-mon@.service" + osd_unit_file: "ceph-osd@.service" + mds_unit_file: "ceph-mds@.service" + after: + - "network-online.target" + restart_policy: "on-failure" + restart_sec: 30 diff --git a/configs/services/dns.yaml b/configs/services/dns.yaml new file mode 100644 index 0000000..db16937 --- /dev/null +++ b/configs/services/dns.yaml @@ -0,0 +1,44 @@ +service: + name: "dns" + version: "1.11.1" + description: "CoreDNS for Kubernetes cluster DNS" + implementation: "coredns" + +config: + bind_address: "10.96.0.10" + port: 53 + + zones: + - name: "cluster.local" + type: "kubernetes" + + - name: "." 
+ type: "forward" + forward_to: + - "8.8.8.8" + - "8.8.4.4" + + plugins: + - "errors" + - "health" + - "ready" + - "kubernetes" + - "prometheus" + - "forward" + - "cache" + - "loop" + - "reload" + - "loadbalance" + + cache: + ttl: 30 + max_size: 10000 + +systemd: + unit_file: "coredns.service" + requires: + - "network-online.target" + after: + - "network-online.target" + restart_policy: "always" + restart_sec: 5 diff --git a/configs/services/kafka.yaml b/configs/services/kafka.yaml new file mode 100644 index 0000000..1b1bece --- /dev/null +++ b/configs/services/kafka.yaml @@ -0,0 +1,50 @@ +service: + name: "kafka" + version: "3.6.0" + description: "Apache Kafka distributed event streaming platform" + +config: + # KRaft mode (no Zookeeper) + mode: "kraft" + + cluster_id: "{{ cluster.kafka_cluster_id }}" + + broker: + broker_id: "{{ node.kafka_broker_id }}" + listeners: "PLAINTEXT://{{ node.ip }}:9092,CONTROLLER://{{ node.ip }}:9093" + advertised_listeners: "PLAINTEXT://{{ node.ip }}:9092" + controller_listener_names: "CONTROLLER" + + log_dirs: "/var/lib/kafka/logs" + num_partitions: 3 + default_replication_factor: 3 + min_insync_replicas: 2 + + log_retention_hours: 168 + log_retention_bytes: 1073741824 + log_segment_bytes: 1073741824 + + auto_create_topics_enable: false + delete_topic_enable: true + + controller: + quorum_voters: "1@192.168.1.30:9093" + + performance: + num_network_threads: 8 + num_io_threads: 8 + socket_send_buffer_bytes: 102400 + socket_receive_buffer_bytes: 102400 + socket_request_max_bytes: 104857600 + +systemd: + unit_file: "kafka.service" + requires: + - "network-online.target" + after: + - "network-online.target" + restart_policy: "always" + restart_sec: 10 + environment: + KAFKA_HEAP_OPTS: "-Xmx2G -Xms2G" + KAFKA_JVM_PERFORMANCE_OPTS: "-XX:+UseG1GC -XX:MaxGCPauseMillis=20" diff --git a/configs/services/kubernetes.yaml b/configs/services/kubernetes.yaml new file mode 100644 index 0000000..197feba --- /dev/null +++ 
b/configs/services/kubernetes.yaml @@ -0,0 +1,42 @@ +service: + name: "kubernetes" + version: "1.28.0" + description: "Kubernetes container orchestration" + +config: + api_server: + port: 6443 + bind_address: "0.0.0.0" + advertise_address: "{{ node.ip }}" + enable_admission_plugins: + - "NodeRestriction" + - "PodSecurity" # PodSecurityPolicy was removed in Kubernetes 1.25; PodSecurity is its built-in successor + + kubelet: + cgroup_driver: "systemd" + container_runtime: "containerd" + container_runtime_endpoint: "unix:///run/containerd/containerd.sock" + pod_manifest_path: "/etc/kubernetes/manifests" + + network: + plugin: "calico" + mtu: 1450 + + etcd: + data_dir: "/var/lib/etcd" + listen_client_urls: "https://{{ node.ip }}:2379" + listen_peer_urls: "https://{{ node.ip }}:2380" + + feature_gates: + # EphemeralContainers is GA since 1.25 and its gate was removed in 1.27; naming it on 1.28 is rejected as an unknown gate + - "CSINodeExpandSecret=true" + +systemd: + unit_file: "kubelet.service" + requires: + - "containerd.service" + after: + - "containerd.service" + - "network-online.target" + restart_policy: "always" + restart_sec: 10 diff --git a/configs/services/mqtt.yaml b/configs/services/mqtt.yaml new file mode 100644 index 0000000..5ff7686 --- /dev/null +++ b/configs/services/mqtt.yaml @@ -0,0 +1,38 @@ +service: + name: "mqtt" + version: "2.0.18" + description: "Mosquitto MQTT message broker" + implementation: "mosquitto" + +config: + listener: + port: 1883 + bind_address: "{{ node.ip }}" + protocol: "mqtt" + max_connections: 10000 + + persistence: + enabled: true + location: "/var/lib/mosquitto" + autosave_interval: 300 + + logging: + log_type: "all" + log_dest: "syslog" + log_timestamp: true + + security: + allow_anonymous: false + password_file: "/etc/mosquitto/passwd" + acl_file: "/etc/mosquitto/acl" + + bridge: + # Optional: bridge to other MQTT brokers + enabled: false + +systemd: + unit_file: "mosquitto.service" + after: + - "network-online.target" + restart_policy: "always" + restart_sec: 5 @@ -0,0 +1,26 @@ +# Spawning a Kubernetes Cluster from SystemD + +##### Overview: + the idea of this program is to establish a linux 
operating system that basically boots into being one functioning node of a kubernetes cluster. it uses the configs in its disk storage to know its node label/category, if it's a master, a kafka coordinator, etc. + the user configures each system component, down the applications running in each OCI container, generates an iso, installs it to each node of the cluster, boots, ...??, profit! + +## First-time boot: (install) + + * write filesystem (which daemons to run, given the host (cluster node). configs for the various daemons. etc + +## Normal boot: + +#### start the following systemd services + - kubernetes + - ceph (or other distributed storage system) + - dns + - kafka + - mqtt + - + - +#### service-specific behavior + - kubernetes + - data-pipeline services (e.g. use kafka messaging, springboot, rabbitmq, nginx) + - monitoring, logging, tracing, observability + - ceph + diff --git a/systemd/ceph-mon.target b/systemd/ceph-mon.target new file mode 100644 index 0000000..9697b9b --- /dev/null +++ b/systemd/ceph-mon.target @@ -0,0 +1,11 @@ +[Unit] +Description=Ceph Monitor Node +Documentation=https://docs.ceph.com/ +Requires=network-online.target +After=network-online.target cluster-detect.service + +# Ceph monitor service. A bare template name (ceph-mon@.service) cannot be instantiated, so the instance is this host's short hostname (%l, systemd >= 249). NOTE(review): assumes the hostname equals the node name used as the mon ID when this unit is loaded - confirm. +Wants=ceph-mon@%l.service + +[Install] +WantedBy=multi-user.target diff --git a/systemd/ceph-mon@.service b/systemd/ceph-mon@.service new file mode 100644 index 0000000..ac471ec --- /dev/null +++ b/systemd/ceph-mon@.service @@ -0,0 +1,31 @@ +[Unit] +Description=Ceph Monitor daemon (mon.%i) +Documentation=https://docs.ceph.com/ +PartOf=ceph-mon.target +After=network-online.target local-fs.target time-sync.target cluster-detect.service +Wants=network-online.target local-fs.target time-sync.target + +[Service] +Type=notify +EnvironmentFile=/etc/cluster-config/environment/ceph.env +ExecStartPre=/usr/local/bin/ceph-mon-init.sh %i +ExecStart=/usr/bin/ceph-mon -f --cluster ceph --id %i --setuser ceph --setgroup 
ceph +ExecReload=/bin/kill -HUP $MAINPID + +# Resource management +LimitNOFILE=1048576 +LimitNPROC=1048576 + +Restart=on-failure +RestartSec=10 +StartLimitInterval=30min +StartLimitBurst=3 + +# Security +NoNewPrivileges=true +ProtectHome=true +ProtectSystem=full +PrivateTmp=true + +[Install] +WantedBy=ceph-mon.target diff --git a/systemd/ceph-osd.target b/systemd/ceph-osd.target new file mode 100644 index 0000000..79c5353 --- /dev/null +++ b/systemd/ceph-osd.target @@ -0,0 +1,11 @@ +[Unit] +Description=Ceph OSD Node +Documentation=https://docs.ceph.com/ +Requires=network-online.target +After=network-online.target cluster-detect.service + +# OSD services will be started per-device +# Wants=ceph-osd@0.service (dynamically added based on node config) + +[Install] +WantedBy=multi-user.target diff --git a/systemd/ceph-osd@.service b/systemd/ceph-osd@.service new file mode 100644 index 0000000..27c52e3 --- /dev/null +++ b/systemd/ceph-osd@.service @@ -0,0 +1,31 @@ +[Unit] +Description=Ceph OSD daemon (osd.%i) +Documentation=https://docs.ceph.com/ +PartOf=ceph-osd.target +After=network-online.target local-fs.target time-sync.target cluster-detect.service +Wants=network-online.target local-fs.target time-sync.target + +[Service] +Type=notify +EnvironmentFile=/etc/cluster-config/environment/ceph.env +ExecStartPre=/usr/local/bin/ceph-osd-init.sh %i +ExecStart=/usr/bin/ceph-osd -f --cluster ceph --id %i --setuser ceph --setgroup ceph +# systemd does not pass command lines through a shell, so a bare $(hostname -s) would reach ceph as literal text. +# Run the step via sh -c; $$ is systemd's escape for a literal $, so the shell performs the substitution. +ExecStartPost=/bin/sh -c 'exec /usr/bin/ceph osd crush create-or-move -- %i "${OSD_WEIGHT}" root=default host="$$(hostname -s)"' + +# Resource management +LimitNOFILE=1048576 +LimitNPROC=1048576 + +Restart=on-failure +RestartSec=10 +StartLimitInterval=30min +StartLimitBurst=5 + +# Security +NoNewPrivileges=true +ProtectHome=true +ProtectSystem=full +PrivateTmp=true + +[Install] +WantedBy=ceph-osd.target diff --git a/systemd/cluster-detect.service b/systemd/cluster-detect.service new file mode 100644 index 0000000..b9d85c4 --- /dev/null +++ 
b/systemd/cluster-detect.service @@ -0,0 +1,33 @@ +[Unit] +Description=Cluster Node Identity Detection +Documentation=man:cluster-detect(8) +# Must run very early, before any cluster services +DefaultDependencies=no +After=local-fs.target +Before=network-pre.target sysinit.target +Wants=local-fs.target + +[Service] +Type=oneshot +RemainAfterExit=yes + +# Configuration directory (will be /etc/cluster-config on installed system) +Environment=CONFIG_DIR=/etc/cluster-config + +ExecStart=/usr/local/bin/cluster-detect.sh + +# Logging +StandardOutput=journal +StandardError=journal +SyslogIdentifier=cluster-detect + +# Security hardening +# (Relaxed for now since it needs to modify /etc/cluster-config) +NoNewPrivileges=true +ProtectHome=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectControlGroups=true + +[Install] +WantedBy=sysinit.target diff --git a/systemd/containerd.service b/systemd/containerd.service new file mode 100644 index 0000000..6d31694 --- /dev/null +++ b/systemd/containerd.service @@ -0,0 +1,31 @@ +[Unit] +Description=containerd container runtime +Documentation=https://containerd.io +After=network.target local-fs.target + +[Service] +Type=notify +ExecStartPre=-/sbin/modprobe overlay +ExecStart=/usr/bin/containerd + +Restart=always +RestartSec=5 + +# Having non-zero Limit*s causes performance problems due to accounting overhead +# in the kernel. We recommend using cgroups to do container-local accounting. +LimitNPROC=infinity +LimitCORE=infinity +LimitNOFILE=infinity + +# Comment TasksMax if your systemd version does not support it. +# Only systemd 226 and above support this option. 
+TasksMax=infinity + +# Set delegate yes so that systemd does not reset the cgroups of docker containers +Delegate=yes + +# Kill only the containerd process, not all processes in the cgroup +KillMode=process + +[Install] +WantedBy=multi-user.target diff --git a/systemd/coredns.service b/systemd/coredns.service new file mode 100644 index 0000000..5bb725d --- /dev/null +++ b/systemd/coredns.service @@ -0,0 +1,31 @@ +[Unit] +Description=CoreDNS DNS server +Documentation=https://coredns.io/manual/toc/ +After=network-online.target kubernetes-master.target cluster-detect.service +Wants=network-online.target + +[Service] +Type=simple +User=coredns +Group=coredns +EnvironmentFile=/etc/cluster-config/environment/coredns.env +ExecStartPre=/usr/local/bin/coredns-config-generator.sh +ExecStart=/usr/bin/coredns -conf /etc/coredns/Corefile +ExecReload=/bin/kill -SIGUSR1 $MAINPID + +Restart=always +RestartSec=5 + +# Security +CapabilityBoundingSet=CAP_NET_BIND_SERVICE +AmbientCapabilities=CAP_NET_BIND_SERVICE +NoNewPrivileges=true +ProtectHome=true +ProtectSystem=strict +ReadWritePaths=/var/lib/coredns +PrivateTmp=true + +LimitNOFILE=8192 + +[Install] +WantedBy=dns.target diff --git a/systemd/dns.target b/systemd/dns.target new file mode 100644 index 0000000..37c874f --- /dev/null +++ b/systemd/dns.target @@ -0,0 +1,10 @@ +[Unit] +Description=Cluster DNS Server +Documentation=https://coredns.io/ +Requires=network-online.target +After=network-online.target cluster-detect.service kubernetes-master.target + +Wants=coredns.service + +[Install] +WantedBy=multi-user.target diff --git a/systemd/etcd.service b/systemd/etcd.service new file mode 100644 index 0000000..831d3eb --- /dev/null +++ b/systemd/etcd.service @@ -0,0 +1,45 @@ +[Unit] +Description=etcd key-value store +Documentation=https://etcd.io/docs/ +After=network.target cluster-detect.service +Before=kube-apiserver.service + +[Service] +Type=notify +EnvironmentFile=/etc/cluster-config/environment/etcd.env 
+ExecStartPre=/usr/local/bin/etcd-config-generator.sh +ExecStart=/usr/bin/etcd \ + --name=${ETCD_NAME} \ + --data-dir=/var/lib/etcd \ + --listen-client-urls=https://${NODE_IP}:2379,https://127.0.0.1:2379 \ + --advertise-client-urls=https://${NODE_IP}:2379 \ + --listen-peer-urls=https://${NODE_IP}:2380 \ + --initial-advertise-peer-urls=https://${NODE_IP}:2380 \ + --initial-cluster=${ETCD_INITIAL_CLUSTER} \ + --initial-cluster-token=etcd-cluster \ + --initial-cluster-state=new \ + --cert-file=/etc/kubernetes/pki/etcd/server.crt \ + --key-file=/etc/kubernetes/pki/etcd/server.key \ + --peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt \ + --peer-key-file=/etc/kubernetes/pki/etcd/peer.key \ + --trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt \ + --peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt \ + --peer-client-cert-auth \ + --client-cert-auth \ + --snapshot-count=10000 \ + --heartbeat-interval=100 \ + --election-timeout=1000 + +Restart=always +RestartSec=10 + +# Security +NoNewPrivileges=true +ProtectHome=true +ProtectSystem=strict +ReadWritePaths=/var/lib/etcd + +LimitNOFILE=65536 + +[Install] +WantedBy=kubernetes-master.target diff --git a/systemd/kafka.service b/systemd/kafka.service new file mode 100644 index 0000000..c6fe3e9 --- /dev/null +++ b/systemd/kafka.service @@ -0,0 +1,34 @@ +[Unit] +Description=Apache Kafka Broker (KRaft mode) +Documentation=https://kafka.apache.org/documentation/ +After=network-online.target cluster-detect.service +Wants=network-online.target + +[Service] +Type=simple +User=kafka +Group=kafka +EnvironmentFile=/etc/cluster-config/environment/kafka.env +Environment="KAFKA_HEAP_OPTS=-Xmx2G -Xms2G" +Environment="KAFKA_JVM_PERFORMANCE_OPTS=-XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:G1HeapRegionSize=16M -XX:MinMetaspaceFreeRatio=50 -XX:MaxMetaspaceFreeRatio=80" + +ExecStartPre=/usr/local/bin/kafka-config-generator.sh +ExecStart=/opt/kafka/bin/kafka-server-start.sh /var/lib/kafka/server.properties + +# 
Graceful shutdown +TimeoutStopSec=180 +SuccessExitStatus=143 + +Restart=always +RestartSec=10 + +# Security +NoNewPrivileges=true +ProtectHome=true +ProtectSystem=strict +ReadWritePaths=/var/lib/kafka /var/log/kafka + +LimitNOFILE=100000 + +[Install] +WantedBy=kafka.target diff --git a/systemd/kafka.target b/systemd/kafka.target new file mode 100644 index 0000000..ea8eb43 --- /dev/null +++ b/systemd/kafka.target @@ -0,0 +1,10 @@ +[Unit] +Description=Apache Kafka Broker +Documentation=https://kafka.apache.org/documentation/ +Requires=network-online.target +After=network-online.target cluster-detect.service + +Wants=kafka.service + +[Install] +WantedBy=multi-user.target diff --git a/systemd/kube-apiserver.service b/systemd/kube-apiserver.service new file mode 100644 index 0000000..7e4f2c6 --- /dev/null +++ b/systemd/kube-apiserver.service @@ -0,0 +1,46 @@ +[Unit] +Description=Kubernetes API Server +Documentation=https://kubernetes.io/docs/reference/command-line-tools-reference/kube-apiserver/ +After=network.target etcd.service cluster-detect.service +Wants=etcd.service + +[Service] +Type=notify +EnvironmentFile=/etc/cluster-config/environment/kube-apiserver.env +ExecStartPre=/usr/local/bin/kube-apiserver-config-generator.sh +ExecStart=/usr/bin/kube-apiserver \ + --advertise-address=${NODE_IP} \ + --allow-privileged=true \ + --authorization-mode=Node,RBAC \ + --client-ca-file=/etc/kubernetes/pki/ca.crt \ + --enable-admission-plugins=NodeRestriction \ + --enable-bootstrap-token-auth=true \ + --etcd-servers=https://127.0.0.1:2379 \ + --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt \ + --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt \ + --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key \ + --kubelet-client-certificate=/etc/kubernetes/pki/apiserver-kubelet-client.crt \ + --kubelet-client-key=/etc/kubernetes/pki/apiserver-kubelet-client.key \ + --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname \ + 
--proxy-client-cert-file=/etc/kubernetes/pki/front-proxy-client.crt \ + --proxy-client-key-file=/etc/kubernetes/pki/front-proxy-client.key \ + --requestheader-allowed-names=front-proxy-client \ + --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt \ + --requestheader-extra-headers-prefix=X-Remote-Extra- \ + --requestheader-group-headers=X-Remote-Group \ + --requestheader-username-headers=X-Remote-User \ + --secure-port=6443 \ + --service-account-issuer=https://kubernetes.default.svc.cluster.local \ + --service-account-key-file=/etc/kubernetes/pki/sa.pub \ + --service-account-signing-key-file=/etc/kubernetes/pki/sa.key \ + --service-cluster-ip-range=${SERVICE_CIDR} \ + --tls-cert-file=/etc/kubernetes/pki/apiserver.crt \ + --tls-private-key-file=/etc/kubernetes/pki/apiserver.key + +Restart=always +RestartSec=10 + +LimitNOFILE=65536 + +[Install] +WantedBy=kubernetes-master.target diff --git a/systemd/kube-controller-manager.service b/systemd/kube-controller-manager.service new file mode 100644 index 0000000..d3a54ed --- /dev/null +++ b/systemd/kube-controller-manager.service @@ -0,0 +1,33 @@ +[Unit] +Description=Kubernetes Controller Manager +Documentation=https://kubernetes.io/docs/reference/command-line-tools-reference/kube-controller-manager/ +After=kube-apiserver.service +Wants=kube-apiserver.service + +[Service] +Type=notify +EnvironmentFile=/etc/cluster-config/environment/kube-controller-manager.env +ExecStart=/usr/bin/kube-controller-manager \ + --allocate-node-cidrs=true \ + --authentication-kubeconfig=/etc/kubernetes/controller-manager.conf \ + --authorization-kubeconfig=/etc/kubernetes/controller-manager.conf \ + --bind-address=127.0.0.1 \ + --client-ca-file=/etc/kubernetes/pki/ca.crt \ + --cluster-cidr=${POD_CIDR} \ + --cluster-name=kubernetes \ + --cluster-signing-cert-file=/etc/kubernetes/pki/ca.crt \ + --cluster-signing-key-file=/etc/kubernetes/pki/ca.key \ + --controllers=*,bootstrapsigner,tokencleaner \ + 
--kubeconfig=/etc/kubernetes/controller-manager.conf \ + --leader-elect=true \ + --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt \ + --root-ca-file=/etc/kubernetes/pki/ca.crt \ + --service-account-private-key-file=/etc/kubernetes/pki/sa.key \ + --service-cluster-ip-range=${SERVICE_CIDR} \ + --use-service-account-credentials=true + +Restart=always +RestartSec=10 + +[Install] +WantedBy=kubernetes-master.target diff --git a/systemd/kube-scheduler.service b/systemd/kube-scheduler.service new file mode 100644 index 0000000..d2c575c --- /dev/null +++ b/systemd/kube-scheduler.service @@ -0,0 +1,20 @@ +[Unit] +Description=Kubernetes Scheduler +Documentation=https://kubernetes.io/docs/reference/command-line-tools-reference/kube-scheduler/ +After=kube-apiserver.service +Wants=kube-apiserver.service + +[Service] +Type=notify +ExecStart=/usr/bin/kube-scheduler \ + --authentication-kubeconfig=/etc/kubernetes/scheduler.conf \ + --authorization-kubeconfig=/etc/kubernetes/scheduler.conf \ + --bind-address=127.0.0.1 \ + --kubeconfig=/etc/kubernetes/scheduler.conf \ + --leader-elect=true + +Restart=always +RestartSec=10 + +[Install] +WantedBy=kubernetes-master.target diff --git a/systemd/kubelet.service b/systemd/kubelet.service new file mode 100644 index 0000000..46be849 --- /dev/null +++ b/systemd/kubelet.service @@ -0,0 +1,29 @@ +[Unit] +Description=Kubernetes Kubelet +Documentation=https://kubernetes.io/docs/concepts/overview/components/#kubelet +After=containerd.service network-online.target cluster-detect.service +Requires=containerd.service +Wants=network-online.target + +[Service] +Type=notify +EnvironmentFile=/etc/cluster-config/environment/kubelet.env +ExecStartPre=/usr/local/bin/kubelet-config-generator.sh +ExecStart=/usr/bin/kubelet \ + --config=/var/lib/kubelet/config.yaml \ + --container-runtime-endpoint=unix:///run/containerd/containerd.sock \ + --kubeconfig=/etc/kubernetes/kubelet.conf \ + --node-ip=${NODE_IP} + +Restart=always +RestartSec=10 + 
+# Resource limits +LimitNOFILE=65536 +LimitNPROC=4096 + +# Security +NoNewPrivileges=false + +[Install] +WantedBy=kubernetes-master.target kubernetes-worker.target diff --git a/systemd/kubernetes-master.target b/systemd/kubernetes-master.target new file mode 100644 index 0000000..ebb024d --- /dev/null +++ b/systemd/kubernetes-master.target @@ -0,0 +1,16 @@ +[Unit] +Description=Kubernetes Master/Control Plane Node +Documentation=https://kubernetes.io/docs/ +Requires=network-online.target +After=network-online.target cluster-detect.service +Wants=containerd.service + +# Master components +Wants=kubelet.service +Wants=kube-apiserver.service +Wants=kube-controller-manager.service +Wants=kube-scheduler.service +Wants=etcd.service + +[Install] +WantedBy=multi-user.target diff --git a/systemd/kubernetes-worker.target b/systemd/kubernetes-worker.target new file mode 100644 index 0000000..59ccefc --- /dev/null +++ b/systemd/kubernetes-worker.target @@ -0,0 +1,12 @@ +[Unit] +Description=Kubernetes Worker Node +Documentation=https://kubernetes.io/docs/ +Requires=network-online.target +After=network-online.target cluster-detect.service +Wants=containerd.service + +# Worker components +Wants=kubelet.service + +[Install] +WantedBy=multi-user.target diff --git a/systemd/mosquitto.service b/systemd/mosquitto.service new file mode 100644 index 0000000..2eff1d4 --- /dev/null +++ b/systemd/mosquitto.service @@ -0,0 +1,28 @@ +[Unit] +Description=Mosquitto MQTT Broker +Documentation=man:mosquitto.conf(5) man:mosquitto(8) +After=network-online.target cluster-detect.service +Wants=network-online.target + +[Service] +Type=notify +NotifyAccess=main +User=mosquitto +Group=mosquitto +EnvironmentFile=/etc/cluster-config/environment/mqtt.env +ExecStartPre=/usr/local/bin/mosquitto-config-generator.sh +ExecStart=/usr/sbin/mosquitto -c /etc/mosquitto/mosquitto.conf +ExecReload=/bin/kill -HUP $MAINPID + +Restart=always +RestartSec=5 + +# Security +NoNewPrivileges=true +ProtectHome=true 
+ProtectSystem=strict +ReadWritePaths=/var/lib/mosquitto +PrivateTmp=true + +[Install] +WantedBy=mqtt.target diff --git a/systemd/mqtt.target b/systemd/mqtt.target new file mode 100644 index 0000000..6396402 --- /dev/null +++ b/systemd/mqtt.target @@ -0,0 +1,10 @@ +[Unit] +Description=MQTT Message Broker +Documentation=https://mosquitto.org/ +Requires=network-online.target +After=network-online.target cluster-detect.service + +Wants=mosquitto.service + +[Install] +WantedBy=multi-user.target diff --git a/tools/ceph-mon-init.sh b/tools/ceph-mon-init.sh new file mode 100755 index 0000000..af24ac5 --- /dev/null +++ b/tools/ceph-mon-init.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# Initialize Ceph Monitor +# + +set -euo pipefail + +MON_ID="$1" +MON_DIR="/var/lib/ceph/mon/ceph-${MON_ID}" + +# Create monitor directory +mkdir -p "$MON_DIR" + +# Check if already initialized +if [ -f "$MON_DIR/done" ]; then + echo "Ceph monitor $MON_ID already initialized" + exit 0 +fi + +echo "TODO: Initialize Ceph monitor $MON_ID" +echo "This requires:" +echo " - Cluster FSID" +echo " - Monitor map" +echo " - Monitor keyring" +echo " - Admin keyring" + +# For now, just create the directory +chown -R ceph:ceph "$MON_DIR" 2>/dev/null || true + +# Mark as initialized (TODO: remove when actual init is implemented) +# touch "$MON_DIR/done" diff --git a/tools/ceph-osd-init.sh b/tools/ceph-osd-init.sh new file mode 100755 index 0000000..97ae412 --- /dev/null +++ b/tools/ceph-osd-init.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# Initialize Ceph OSD +# + +set -euo pipefail + +OSD_ID="$1" +OSD_DIR="/var/lib/ceph/osd/ceph-${OSD_ID}" + +# Create OSD directory +mkdir -p "$OSD_DIR" + +# Check if already initialized +if [ -f "$OSD_DIR/ready" ]; then + echo "Ceph OSD $OSD_ID already initialized" + exit 0 +fi + +echo "TODO: Initialize Ceph OSD $OSD_ID" +echo "This requires:" +echo " - Device preparation" +echo " - OSD keyring" +echo " - OSD filesystem creation" +echo " - OSD ID assignment" + +# For now, just create 
#!/bin/bash
#
# Cluster Role Activation Script
#
# Reads the detected node configuration and enables appropriate systemd
# targets based on the node's roles.
#
# Called by cluster-detect.service after node detection is complete.
#

set -euo pipefail

CONFIG_DIR="${CONFIG_DIR:-/etc/cluster-config}"
CURRENT_NODE_FILE="$CONFIG_DIR/current-node.yaml"
NODE_IDENTITY_FILE="$CONFIG_DIR/node-identity"

# Log an informational message to the journal (best effort) and to stderr.
# stderr is used on purpose: role_to_target is called inside $(...), and
# anything written to stdout there would become part of the target name.
log() {
    echo "[cluster-activate] $*" | systemd-cat -t cluster-activate -p info 2>/dev/null || true
    echo "[cluster-activate] $*" >&2
}

# Log an error message to the journal (best effort) and to stderr.
error() {
    echo "[cluster-activate] ERROR: $*" | systemd-cat -t cluster-activate -p err 2>/dev/null || true
    echo "[cluster-activate] ERROR: $*" >&2
}

# Print the entries of the block list that follows the first "roles:" key in
# the node config, one per line. Parsing stops at the first line that is not
# a list item, so lists belonging to later keys are not mistaken for roles.
# Quotes and trailing "# comments" are stripped.
get_roles() {
    awk -v sq="'" '
        !in_roles && /^[[:space:]]*roles:[[:space:]]*(#.*)?$/ { in_roles = 1; next }
        !in_roles { next }
        /^[[:space:]]*(#.*)?$/ { next }
        /^[[:space:]]*-[[:space:]]/ {
            item = $0
            sub(/^[[:space:]]*-[[:space:]]*/, "", item)
            sub(/[[:space:]]+#.*$/, "", item)
            gsub(/"/, "", item)
            gsub(sq, "", item)
            sub(/[[:space:]]+$/, "", item)
            if (item != "") print item
            next
        }
        { exit }
    ' "$CURRENT_NODE_FILE"
}

# Print the systemd target for a role name. Unknown roles print nothing
# (a warning is logged on stderr).
role_to_target() {
    local role="$1"

    case "$role" in
        master|control-plane)
            echo "kubernetes-master.target"
            ;;
        worker)
            echo "kubernetes-worker.target"
            ;;
        kafka-broker|kafka-controller)
            echo "kafka.target"
            ;;
        ceph-mon)
            echo "ceph-mon.target"
            ;;
        ceph-osd)
            echo "ceph-osd.target"
            ;;
        ceph-mds)
            echo "ceph-mds.target"
            ;;
        mqtt-broker)
            echo "mqtt.target"
            ;;
        dns-server)
            echo "dns.target"
            ;;
        *)
            log "Unknown role: $role"
            ;;
    esac
}

# Resolve the node's roles to systemd targets, then enable and start them.
# Exits 1 when the node config, identity file or roles are missing; failures
# of individual systemctl calls are logged and do not abort the run.
main() {
    log "Activating systemd targets based on node roles..."

    if [ ! -f "$CURRENT_NODE_FILE" ]; then
        error "Current node config not found: $CURRENT_NODE_FILE"
        exit 1
    fi

    if [ ! -f "$NODE_IDENTITY_FILE" ]; then
        error "Node identity file not found: $NODE_IDENTITY_FILE"
        exit 1
    fi

    local node_name
    node_name=$(cat "$NODE_IDENTITY_FILE")
    log "Node identity: $node_name"

    local roles
    roles=$(get_roles)

    if [ -z "$roles" ]; then
        error "No roles found in node configuration"
        exit 1
    fi

    log "Node roles: ${roles//$'\n'/ }"

    # Collect the distinct targets; several roles may share one target.
    local targets=() role target
    while IFS= read -r role; do
        [ -n "$role" ] || continue
        target=$(role_to_target "$role")
        [ -n "$target" ] || continue
        log "Role '$role' -> $target"
        case " ${targets[*]-} " in
            *" $target "*) ;;
            *) targets+=("$target") ;;
        esac
    done <<< "$roles"

    # Guard the expansions: an empty array under `set -u` is an error on
    # bash releases older than 4.4.
    if [ "${#targets[@]}" -gt 0 ]; then
        # Enable targets (persist across reboots)
        for target in "${targets[@]}"; do
            log "Enabling $target..."
            systemctl enable "$target" || log "Warning: failed to enable $target"
        done

        # Start targets
        for target in "${targets[@]}"; do
            log "Starting $target..."
            systemctl start "$target" || log "Warning: failed to start $target"
        done
    else
        log "Warning: none of the configured roles maps to a systemd target"
    fi

    log "Role activation complete"
    exit 0
}

main "$@"
# Log an informational message to the journal (best effort) and to stderr.
#
# stderr is deliberate: the detect_* functions return the node name on
# stdout and are called as $(detect_by_mac) etc., so a log line written to
# stdout would end up inside the detected node name.
log() {
    echo "[cluster-detect] $*" | systemd-cat -t cluster-detect -p info 2>/dev/null || true
    echo "[cluster-detect] $*" >&2
}

# Log an error message to the journal (best effort) and to stderr.
error() {
    echo "[cluster-detect] ERROR: $*" | systemd-cat -t cluster-detect -p err 2>/dev/null || true
    echo "[cluster-detect] ERROR: $*" >&2
}
# Detect node by MAC address
#
# Compares every MAC address of this system with the mac_addresses listed in
# each $NODES_DIR/*.yaml file. On the first match the node name (the file
# name without .yaml) is printed on stdout and 0 is returned; otherwise 1.
detect_by_mac() {
    # The function's stdout is captured by the caller, so log output is
    # explicitly sent to stderr.
    log "Attempting detection by MAC address..." >&2

    local system_macs node_file node_name node_macs sys_mac node_mac

    # Hex digits are folded to lower case on both sides so that
    # "AA:BB:..." in a config matches "aa:bb:..." reported by the kernel.
    system_macs=$(get_mac_addresses | tr 'A-F' 'a-f') || true
    log "System MAC addresses: ${system_macs//$'\n'/ }" >&2

    for node_file in "$NODES_DIR"/*.yaml; do
        [ -f "$node_file" ] || continue

        node_name=$(basename "$node_file" .yaml)
        # A node file without mac_addresses simply yields no candidates.
        node_macs=$(get_node_macs "$node_file" | tr 'A-F' 'a-f') || true

        # Check if any system MAC matches any node MAC
        for sys_mac in $system_macs; do
            for node_mac in $node_macs; do
                if [ "$sys_mac" = "$node_mac" ]; then
                    log "Matched MAC $sys_mac to node $node_name" >&2
                    echo "$node_name"
                    return 0
                fi
            done
        done
    done

    return 1
}
# Interactive selection (fallback)
#
# Lists the node configs found in $NODES_DIR and asks the operator to pick
# one. The chosen node name is the only thing written to stdout, because the
# caller captures stdout; the menu, prompt and log messages go to stderr.
# Returns 1 when there is nothing to choose from, no input is available, or
# the answer is not a number in range.
interactive_select() {
    log "Automatic detection failed, requiring interactive selection" >&2

    local nodes=() node_file selection idx

    for node_file in "$NODES_DIR"/*.yaml; do
        [ -f "$node_file" ] || continue
        nodes+=("$(basename "$node_file" .yaml)")
    done

    if [ "${#nodes[@]}" -eq 0 ]; then
        error "No node configurations found in $NODES_DIR"
        return 1
    fi

    {
        echo
        echo "====================================="
        echo " Cluster Node Identity Selection"
        echo "====================================="
        echo
        echo "Could not automatically detect which node this is."
        echo "Please select from the available nodes:"
        echo
        for idx in "${!nodes[@]}"; do
            echo " $((idx + 1))) ${nodes[$idx]}"
        done
        echo
    } >&2

    # read fails at EOF, e.g. when no console is attached during boot.
    if ! read -r -p "Enter number (1-${#nodes[@]}): " selection; then
        error "No input available for interactive selection"
        return 1
    fi

    # Reject empty and non-numeric answers before any arithmetic test.
    case "$selection" in
        ''|*[!0-9]*)
            error "Invalid selection"
            return 1
            ;;
    esac

    # 10# forces base 10 so an answer such as "08" is not parsed as octal.
    selection=$((10#$selection))

    if [ "$selection" -ge 1 ] && [ "$selection" -le "${#nodes[@]}" ]; then
        local selected_node="${nodes[$((selection - 1))]}"
        log "User selected node: $selected_node" >&2
        echo "$selected_node"
        return 0
    fi

    error "Invalid selection"
    return 1
}
# Main
#
# Detects which node this system is, links its config to $CURRENT_NODE_FILE,
# records the name in $NODE_IDENTITY_FILE and then runs the environment-file
# generator and the role activation script. Does nothing when an identity
# file already exists. Exits 1 when the config directories are missing or no
# node could be determined; failures of the two follow-up scripts are logged
# as warnings only.
main() {
    log "Starting cluster node detection..."

    # Check if already detected
    if [ -f "$NODE_IDENTITY_FILE" ]; then
        local existing_node
        existing_node=$(cat "$NODE_IDENTITY_FILE")
        log "Node already identified as: $existing_node"
        log "Skipping detection (remove $NODE_IDENTITY_FILE to re-detect)"
        exit 0
    fi

    # Validate config directory exists
    if [ ! -d "$CONFIG_DIR" ]; then
        error "Config directory not found: $CONFIG_DIR"
        exit 1
    fi

    if [ ! -d "$NODES_DIR" ]; then
        error "Nodes directory not found: $NODES_DIR"
        exit 1
    fi

    # Detect node. The assignment is tested directly: as a plain statement a
    # failing $(detect_node) would trip `set -e` and end the script before
    # the error below could be reported.
    local detected_node=""
    if ! detected_node=$(detect_node); then
        error "Node detection failed"
        exit 1
    fi

    # The node name is the last line printed by the detection chain; drop
    # anything a helper may have written to stdout before it.
    detected_node="${detected_node##*$'\n'}"

    if [ -z "$detected_node" ]; then
        error "Node detection failed"
        exit 1
    fi

    log "Detected node: $detected_node"

    # Link the node's config to current-node.yaml
    local node_config="$NODES_DIR/$detected_node.yaml"

    if [ ! -f "$node_config" ]; then
        error "Node config not found: $node_config"
        exit 1
    fi

    log "Linking $node_config -> $CURRENT_NODE_FILE"
    ln -sf "$node_config" "$CURRENT_NODE_FILE"

    # Write identity file
    echo "$detected_node" > "$NODE_IDENTITY_FILE"

    log "Node detection complete: $detected_node"
    log "Config available at: $CURRENT_NODE_FILE"

    # Generate environment files for services
    log "Generating environment files..."
    /usr/local/bin/generate-environment-files.sh || log "Warning: environment file generation failed"

    # Activate systemd targets based on node roles
    log "Activating node roles..."
    /usr/local/bin/cluster-activate-roles.sh || log "Warning: role activation script failed"

    exit 0
}
#!/bin/bash
#
# Generate CoreDNS Corefile configuration
#
# Writes /etc/coredns/Corefile with two server blocks: one serving the
# cluster.local zone through the kubernetes plugin and one forwarding
# everything else to public resolvers.
#

set -euo pipefail

CONFIG_DIR="${CONFIG_DIR:-/etc/cluster-config}"
COREDNS_DIR="/etc/coredns"
COREFILE="$COREDNS_DIR/Corefile"

mkdir -p "$COREDNS_DIR"

# Generate Corefile. The heredoc delimiter is quoted, so nothing inside is
# expanded by the shell.
#
# The health endpoint is process-wide in CoreDNS, so it is enabled in the
# first server block only; enabling it in both would make two blocks claim
# the same health port.
cat > "$COREFILE" <<'EOF'
# CoreDNS Configuration for Kubernetes

cluster.local:53 {
    errors
    health {
        lameduck 5s
    }
    ready
    kubernetes cluster.local in-addr.arpa ip6.arpa {
        pods insecure
        fallthrough in-addr.arpa ip6.arpa
        ttl 30
    }
    prometheus :9153
    forward . /etc/resolv.conf {
        max_concurrent 1000
    }
    cache 30
    loop
    reload
    loadbalance
}

.:53 {
    errors
    ready
    forward . 8.8.8.8 8.8.4.4
    cache 30
    reload
}
EOF

# Ownership is best effort: the coredns user may not exist yet.
chown -R coredns:coredns "$COREDNS_DIR" 2>/dev/null || true

echo "CoreDNS configuration generated at $COREFILE"
# Print the scalar value of the first "<key>: value" line in $CLUSTER_CONFIG.
#
# The key may appear at any indentation (e.g. nested under "network:").
# Surrounding quotes, trailing "# comments" and whitespace are removed.
# Prints nothing when the file is unreadable, the key is absent, or the key
# has no scalar value on the same line, so callers can fall back to a
# default with ${var:-default}.
get_cluster_value() {
    local key="$1"

    [ -r "$CLUSTER_CONFIG" ] || return 0

    awk -v key="$key" -v sq="'" '
        {
            line = $0
            sub(/^[[:space:]]+/, "", line)
            # Literal prefix match: the key is not treated as a regex.
            if (index(line, key ":") != 1) next

            val = substr(line, length(key) + 2)
            sub(/[[:space:]]+#.*$/, "", val)
            gsub(/^[[:space:]]+|[[:space:]]+$/, "", val)

            first = substr(val, 1, 1)
            last = substr(val, length(val), 1)
            if (length(val) >= 2 && first == last && (first == "\"" || first == sq)) {
                val = substr(val, 2, length(val) - 2)
            }

            print val
            exit
        }
    ' "$CLUSTER_CONFIG"
}
# Generate Kafka environment
#
# Writes $ENV_DIR/kafka.env with the node IP, node name and broker ID. The
# broker ID is read from the first "kafka_broker_id:" line of $CURRENT_NODE;
# quotes and trailing comments are stripped, and anything that is not a
# plain non-negative integer falls back to 1.
generate_kafka_env() {
    local node_ip node_name broker_id

    # Lookups stay best effort: a missing value yields an empty variable
    # instead of aborting the whole generator under `set -e`.
    node_ip=$(get_node_ip) || true
    node_name=$(get_node_name) || true

    broker_id=$(
        sed -n 's/^[[:space:]]*kafka_broker_id:[[:space:]]*//p' "$CURRENT_NODE" \
            | head -1 \
            | sed -e 's/[[:space:]]*#.*$//' -e "s/[\"']//g" -e 's/[[:space:]]*$//'
    ) || true

    case "$broker_id" in
        ''|*[!0-9]*) broker_id=1 ;;
    esac

    cat > "$ENV_DIR/kafka.env" <<EOF
NODE_IP=$node_ip
NODE_NAME=$node_name
KAFKA_BROKER_ID=$broker_id
EOF

    log "Generated kafka.env"
}
#!/bin/bash
#
# Generate Kafka server.properties from cluster config
#
# Reads NODE_IP and KAFKA_BROKER_ID from
# /etc/cluster-config/environment/kafka.env (when present) and writes a
# combined broker+controller configuration to /var/lib/kafka/server.properties.
#

set -euo pipefail

CONFIG_DIR="${CONFIG_DIR:-/etc/cluster-config}"
KAFKA_DIR="/var/lib/kafka"
KAFKA_CONFIG="$KAFKA_DIR/server.properties"

# Create Kafka directories
mkdir -p "$KAFKA_DIR/logs"
mkdir -p "$(dirname "$KAFKA_CONFIG")"

# Source environment variables
if [ -f /etc/cluster-config/environment/kafka.env ]; then
    source /etc/cluster-config/environment/kafka.env
fi

# Read from service config
# NOTE(review): SERVICE_CONFIG is set but not consulted in this script —
# confirm whether values are meant to be read from it.
SERVICE_CONFIG="$CONFIG_DIR/services/kafka.yaml"

# The same ID must be used for node.id and for this node's entry in
# controller.quorum.voters; otherwise the node is not a member of the
# quorum it is configured to run.
NODE_ID="${KAFKA_BROKER_ID:-1}"

# Generate server.properties
cat > "$KAFKA_CONFIG" <<EOF
# Server Basics
process.roles=broker,controller
node.id=${NODE_ID}
controller.quorum.voters=${NODE_ID}@${NODE_IP:-localhost}:9093

# Socket Server Settings
listeners=PLAINTEXT://${NODE_IP:-0.0.0.0}:9092,CONTROLLER://${NODE_IP:-0.0.0.0}:9093
advertised.listeners=PLAINTEXT://${NODE_IP:-localhost}:9092
controller.listener.names=CONTROLLER
inter.broker.listener.name=PLAINTEXT
# CONTROLLER is a custom listener name and needs an explicit protocol mapping
listener.security.protocol.map=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT

# Log Basics
log.dirs=$KAFKA_DIR/logs
num.partitions=3
default.replication.factor=3
min.insync.replicas=2

# Log Retention
log.retention.hours=168
log.retention.bytes=1073741824
log.segment.bytes=1073741824

# Cluster ID (should be generated once for the cluster)
cluster.id=MkU3OEVBNTcwNTJENDM2Qk

# Performance tuning
num.network.threads=8
num.io.threads=8
socket.send.buffer.bytes=102400
socket.receive.buffer.bytes=102400
socket.request.max.bytes=104857600
EOF

# Set ownership (best effort: the kafka user may not exist yet)
chown -R kafka:kafka "$KAFKA_DIR" 2>/dev/null || true

echo "Kafka configuration generated at $KAFKA_CONFIG"
#!/bin/bash
#
# Generate kubelet configuration from cluster config
#
# Writes /var/lib/kubelet/config.yaml. The cgroup driver is taken from the
# first "cgroup_driver:" line of $CONFIG_DIR/services/kubernetes.yaml and
# defaults to "systemd" when the file or the key is missing.
#

set -euo pipefail

CONFIG_DIR="${CONFIG_DIR:-/etc/cluster-config}"
KUBELET_DIR="/var/lib/kubelet"

mkdir -p "$KUBELET_DIR"

# Read service config
SERVICE_CONFIG="$CONFIG_DIR/services/kubernetes.yaml"

# Extract kubelet config values.
# The lookup must not fail the script: with `set -e` and `pipefail` a missing
# key would otherwise abort here, before the ":-systemd" default below can
# take effect.
CGROUP_DRIVER=""
if [ -r "$SERVICE_CONFIG" ]; then
    CGROUP_DRIVER=$(
        sed -n 's/^[[:space:]]*cgroup_driver:[[:space:]]*//p' "$SERVICE_CONFIG" \
            | head -1 \
            | sed -e 's/[[:space:]]*#.*$//' -e "s/[\"']//g" -e 's/[[:space:]]*$//'
    ) || true
fi

# Generate kubelet config.yaml
cat > "$KUBELET_DIR/config.yaml" <<EOF
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
cgroupDriver: ${CGROUP_DRIVER:-systemd}
authentication:
  anonymous:
    enabled: false
  webhook:
    enabled: true
  x509:
    clientCAFile: /etc/kubernetes/pki/ca.crt
authorization:
  mode: Webhook
clusterDomain: cluster.local
clusterDNS:
  - 10.96.0.10
runtimeRequestTimeout: 15m
tlsCertFile: /var/lib/kubelet/pki/kubelet.crt
tlsPrivateKeyFile: /var/lib/kubelet/pki/kubelet.key
EOF

echo "Kubelet config generated at $KUBELET_DIR/config.yaml"
#!/bin/bash
#
# Generate Mosquitto configuration
#
# Writes /etc/mosquitto/mosquitto.conf (listening on NODE_IP from
# /etc/cluster-config/environment/mqtt.env when available) and makes sure
# the password and ACL files referenced by the config exist.
#

set -euo pipefail

CONFIG_DIR="${CONFIG_DIR:-/etc/cluster-config}"
MOSQUITTO_DIR="/etc/mosquitto"
MOSQUITTO_CONFIG="$MOSQUITTO_DIR/mosquitto.conf"

mkdir -p "$MOSQUITTO_DIR"

# Source environment
if [ -f /etc/cluster-config/environment/mqtt.env ]; then
    source /etc/cluster-config/environment/mqtt.env
fi

# Read service config
# NOTE(review): SERVICE_CONFIG is set but not consulted in this script —
# confirm whether values are meant to be read from it.
SERVICE_CONFIG="$CONFIG_DIR/services/mqtt.yaml"

# Generate mosquitto.conf
cat > "$MOSQUITTO_CONFIG" <<EOF
# Mosquitto MQTT Broker Configuration

# Listener
listener 1883 ${NODE_IP:-0.0.0.0}
protocol mqtt

# Persistence
persistence true
persistence_location /var/lib/mosquitto/
autosave_interval 300

# Logging
log_dest syslog
log_type all
log_timestamp true
log_timestamp_format %Y-%m-%dT%H:%M:%S

# Security
allow_anonymous false
password_file $MOSQUITTO_DIR/passwd
acl_file $MOSQUITTO_DIR/acl

# Connection limits
max_connections 10000

# Performance
max_queued_messages 1000
max_inflight_messages 100
EOF

# Create empty passwd and acl files if they don't exist
touch "$MOSQUITTO_DIR/passwd"
touch "$MOSQUITTO_DIR/acl"

# Hand the directory to the broker user, then stop other users from reading
# the credential and ACL files. Permissions are tightened only when the
# ownership change worked, so the files never end up unreadable for the
# broker on systems where the mosquitto user does not exist.
if chown -R mosquitto:mosquitto "$MOSQUITTO_DIR" 2>/dev/null; then
    chmod 0600 "$MOSQUITTO_DIR/passwd" "$MOSQUITTO_DIR/acl"
fi

echo "Mosquitto configuration generated at $MOSQUITTO_CONFIG"
class ValidationError(Exception):
    """Raised when a configuration file cannot be read or parsed."""


class ConfigValidator:
    """Validate the cluster, node and service files of a config directory.

    Problems are collected in ``errors`` and ``warnings`` (lists of
    already-prefixed message strings) instead of being raised, so one run
    reports as many findings as possible.
    """

    def __init__(self, config_dir: str):
        self.config_dir = Path(config_dir)
        self.errors: List[str] = []
        self.warnings: List[str] = []

        # Parsed cluster.yaml, or None when it was not (successfully) loaded.
        self.cluster_config = None
        # Parsed nodes/*.yaml and services/*.yaml, keyed by file stem.
        self.node_configs = {}
        self.service_configs = {}

    @staticmethod
    def _require_mapping(data: Any, file_path: Path) -> Dict[str, Any]:
        """Return *data* if it is a mapping, else raise ValidationError.

        An empty YAML document parses to ``None``; without this check the
        ``key in config`` tests further down would fail with a TypeError.
        """
        if not isinstance(data, dict):
            found = "empty" if data is None else f"a {type(data).__name__}"
            raise ValidationError(
                f"{file_path}: expected a YAML mapping at the top level, "
                f"but the document is {found}"
            )
        return data

    def load_yaml(self, file_path: Path) -> Dict[str, Any]:
        """Load a YAML file and return its top-level mapping.

        Raises:
            ValidationError: if the file is missing, is not valid YAML, or
                does not contain a mapping at the top level.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except yaml.YAMLError as e:
            raise ValidationError(f"YAML syntax error in {file_path}: {e}") from e
        except FileNotFoundError:
            raise ValidationError(f"File not found: {file_path}") from None
        return self._require_mapping(data, file_path)

    def add_error(self, message: str):
        """Add a validation error"""
        self.errors.append(f"ERROR: {message}")

    def add_warning(self, message: str):
        """Add a validation warning"""
        self.warnings.append(f"WARNING: {message}")

    def validate_cluster_config(self):
        """Validate cluster.yaml: required keys, node list and services."""
        cluster_file = self.config_dir / "cluster.yaml"

        if not cluster_file.exists():
            self.add_error("cluster.yaml not found")
            return

        self.cluster_config = self.load_yaml(cluster_file)

        # Check required top-level keys
        for key in ('cluster', 'network', 'nodes', 'services'):
            if key not in self.cluster_config:
                self.add_error(f"cluster.yaml missing required key: {key}")

        if 'cluster' in self.cluster_config:
            cluster = self.cluster_config['cluster']
            if not isinstance(cluster, dict):
                self.add_error("cluster section must be a mapping")
            else:
                for field in ('name', 'domain'):
                    if field not in cluster:
                        self.add_error(f"cluster.{field} is required")

        # Validate nodes list
        if 'nodes' in self.cluster_config:
            self.validate_cluster_nodes()

        # Validate services
        if 'services' in self.cluster_config:
            self.validate_cluster_services()

    def validate_cluster_nodes(self):
        """Validate the nodes list in cluster.yaml.

        Reports missing names/IPs/roles, duplicate names and IPs, and a
        cluster without any master/control-plane node.
        """
        nodes = self.cluster_config.get('nodes') or []

        if not isinstance(nodes, list):
            self.add_error("nodes in cluster.yaml must be a list")
            return

        if not nodes:
            self.add_error("No nodes defined in cluster.yaml")
            return

        seen_ips = set()
        seen_names = set()
        has_master = False

        for idx, node in enumerate(nodes):
            # Check required fields
            if not isinstance(node, dict) or 'name' not in node:
                self.add_error(f"Node at index {idx} missing 'name'")
                continue

            name = node['name']

            # Check for duplicates
            if name in seen_names:
                self.add_error(f"Duplicate node name: {name}")
            seen_names.add(name)

            if 'ip' not in node:
                self.add_error(f"Node {name} missing 'ip'")
            else:
                ip = node['ip']
                if ip in seen_ips:
                    self.add_error(f"Duplicate IP address: {ip}")
                seen_ips.add(ip)

            if 'roles' not in node:
                self.add_error(f"Node {name} missing 'roles'")
                continue

            roles = node['roles']
            if not roles:
                self.add_error(f"Node {name} has empty roles list")
            elif not isinstance(roles, list):
                # A bare string would be substring-matched by `in` below.
                self.add_error(f"Node {name} roles must be a list")
            elif 'master' in roles or 'control-plane' in roles:
                has_master = True

        if not has_master:
            self.add_error("Cluster must have at least one master/control-plane node")

    def validate_cluster_services(self):
        """Validate the services section in cluster.yaml.

        Every enabled service must have a services/<name>.yaml file.
        """
        services = self.cluster_config.get('services') or {}

        if not isinstance(services, dict) or 'enabled' not in services:
            self.add_warning("No enabled services defined in cluster.yaml")
            return

        enabled = services['enabled']
        if not enabled:
            # Covers both an empty list and `enabled:` with no value (None).
            self.add_warning("No services enabled")
            return

        # Check that service configs exist for enabled services
        services_dir = self.config_dir / "services"
        for service_name in enabled:
            service_file = services_dir / f"{service_name}.yaml"
            if not service_file.exists():
                self.add_error(f"Service config not found: {service_file}")

    def validate_node_configs(self):
        """Validate all node configuration files under nodes/.

        Checks the ``node`` section of each file, reports MAC addresses
        used by more than one node, and cross-checks the files against the
        node list in cluster.yaml.
        """
        nodes_dir = self.config_dir / "nodes"

        if not nodes_dir.exists():
            self.add_error("nodes/ directory not found")
            return

        node_files = sorted(nodes_dir.glob("*.yaml"))
        if not node_files:
            self.add_warning("No node configuration files found in nodes/")
            return

        seen_macs = defaultdict(list)

        for node_file in node_files:
            try:
                node_config = self.load_yaml(node_file)
            except ValidationError as e:
                # One unreadable file must not hide findings in the others.
                self.add_error(str(e))
                continue

            node_name = node_file.stem
            self.node_configs[node_name] = node_config

            node = node_config.get('node')
            if not isinstance(node, dict):
                self.add_error(f"{node_file.name}: missing 'node' section")
                continue

            # Validate required fields
            if 'name' not in node:
                self.add_error(f"{node_file.name}: missing node.name")
            elif node['name'] != node_name:
                self.add_warning(f"{node_file.name}: node.name '{node['name']}' doesn't match filename")

            if 'roles' not in node:
                self.add_error(f"{node_file.name}: missing node.roles")
            elif not node['roles']:
                self.add_error(f"{node_file.name}: node.roles is empty")

            # Collect MAC addresses; compared case-insensitively so that
            # "AA:BB" and "aa:bb" count as the same address.
            hardware = node.get('hardware')
            macs = hardware.get('mac_addresses') if isinstance(hardware, dict) else None
            for mac in macs or []:
                seen_macs[str(mac).lower()].append(node_name)

        # Report duplicate MACs
        for mac, nodes in seen_macs.items():
            if len(nodes) > 1:
                self.add_error(f"Duplicate MAC address {mac} in nodes: {', '.join(nodes)}")

        self._check_node_coverage()

    def _check_node_coverage(self):
        """Cross-check cluster.yaml node names against loaded node configs."""
        if not self.cluster_config:
            return

        listed = self.cluster_config.get('nodes')
        if not isinstance(listed, list):
            return

        # Entries without a name are reported by validate_cluster_nodes;
        # skip them here instead of failing with a KeyError.
        cluster_nodes = {n['name'] for n in listed if isinstance(n, dict) and 'name' in n}
        config_nodes = set(self.node_configs)

        missing = sorted(str(n) for n in cluster_nodes - config_nodes)
        extra = sorted(str(n) for n in config_nodes - cluster_nodes)

        if missing:
            self.add_error(f"Nodes in cluster.yaml missing node configs: {', '.join(missing)}")
        if extra:
            self.add_warning(f"Node configs not referenced in cluster.yaml: {', '.join(extra)}")

    def validate_service_configs(self):
        """Validate service configuration files under services/."""
        services_dir = self.config_dir / "services"

        if not services_dir.exists():
            self.add_error("services/ directory not found")
            return

        service_files = sorted(services_dir.glob("*.yaml"))
        if not service_files:
            self.add_warning("No service configuration files found")
            return

        for service_file in service_files:
            try:
                service_config = self.load_yaml(service_file)
            except ValidationError as e:
                self.add_error(str(e))
                continue

            service_name = service_file.stem
            self.service_configs[service_name] = service_config

            service = service_config.get('service')
            if not isinstance(service, dict):
                self.add_error(f"{service_file.name}: missing 'service' section")
                continue

            if 'name' not in service:
                self.add_error(f"{service_file.name}: missing service.name")
            elif service['name'] != service_name:
                self.add_warning(f"{service_file.name}: service.name '{service['name']}' doesn't match filename")

            if 'version' not in service:
                self.add_warning(f"{service_file.name}: missing service.version")

    def validate_all(self) -> bool:
        """Run all validations, print a report, and return True on success."""
        print(f"Validating configuration in: {self.config_dir}")
        print("=" * 60)

        # Each stage is guarded separately so that a broken cluster.yaml
        # does not prevent the node and service checks from running.
        stages = (
            self.validate_cluster_config,
            self.validate_node_configs,
            self.validate_service_configs,
        )
        for stage in stages:
            try:
                stage()
            except ValidationError as e:
                self.add_error(str(e))

        # Print results
        print()
        if self.warnings:
            print("Warnings:")
            for warning in self.warnings:
                print(f" {warning}")
            print()

        if self.errors:
            print("Errors:")
            for error in self.errors:
                print(f" {error}")
            print()
            print(f"Validation FAILED with {len(self.errors)} error(s)")
            return False

        print("✓ Validation PASSED")
        if self.warnings:
            print(f" ({len(self.warnings)} warning(s))")
        return True


def main():
    """Validate the directory given as argv[1] (default: ../configs).

    Exits with status 0 when validation passes and 1 otherwise.
    """
    if len(sys.argv) > 1:
        config_dir = sys.argv[1]
    else:
        # Default to configs/ next to the directory containing this script
        script_dir = Path(__file__).parent.parent
        config_dir = script_dir / "configs"

    validator = ConfigValidator(config_dir)
    success = validator.validate_all()

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
