Quickstart (Python API)¶
Who needs this? Python developers, data scientists, and researchers who need custom workflows
What problem does this solve? Integrating FoodSpec into Python scripts, Jupyter notebooks, and ML pipelines
When to use this? For fine-grained control, custom preprocessing, or integration with existing code
Why does it matter? The Python API provides maximum flexibility for research and development
Time to complete: 15 minutes
Prerequisites: FoodSpec installed (pip install foodspec), Python 3.10+, basic pandas/numpy knowledge
Installation¶
pip (Recommended)¶
pip install foodspec
conda¶
conda install -c conda-forge foodspec
Verify¶
import foodspec
print(f"FoodSpec {foodspec.__version__}")
Dataset Format¶
Data Format Reference
See Data Format Reference for complete schema specifications, validation checklists, and best practices. Key terms defined in Glossary.
CSV Requirements¶
Long layout: one row per wavenumber. Metadata columns (oil_type, batch) describe the block of spectra, and each remaining column (s1, s2, ...) holds one spectrum:
wavenumber,oil_type,batch,s1,s2,s3
1000.0,olive,A,5.2,5.1,4.8
1010.0,olive,A,5.5,5.3,5.0
1020.0,olive,A,5.8,5.6,5.2
In Python¶
import pandas as pd
import numpy as np
# Load CSV
df = pd.read_csv('oils.csv')
# Required columns:
wavenumbers = df['wavenumber'].values # shape: (150,)
spectra = df.iloc[:, 3:].values.T # shape: (3, 150) — sample columns s1–s3 transposed to one spectrum per row
labels = np.array([df['oil_type'].iloc[0]] * len(spectra)) # shape: (3,) — one label per spectrum (this file holds a single class)
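Before modeling, a quick sanity check on the file catches layout problems early. This is a plain pandas/NumPy sketch (no FoodSpec API assumed):
import numpy as np
import pandas as pd
df = pd.read_csv('oils.csv')
# Expected metadata columns from the layout above
assert {'wavenumber', 'oil_type', 'batch'}.issubset(df.columns), "missing metadata columns"
# Wavenumber axis should be numeric and finite
w = df['wavenumber'].to_numpy(dtype=float)
assert np.isfinite(w).all(), "non-numeric or missing wavenumbers"
# Intensity columns should be numeric with no missing values
intensities = df.drop(columns=['wavenumber', 'oil_type', 'batch'])
assert not intensities.isna().any().any(), "missing intensity values"
print(f"✓ {intensities.shape[1]} spectra × {len(w)} wavenumber rows")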
Or generate synthetic data for testing¶
import numpy as np
import pandas as pd
np.random.seed(42)
wavenumbers = np.linspace(1000, 3000, 150)
# Generate 10 samples per class
olive_spectra = np.random.normal(5.0, 0.3, (10, 150))
palm_spectra = np.random.normal(6.0, 0.3, (10, 150))
# Combine
X = np.vstack([olive_spectra, palm_spectra]) # shape: (20, 150)
y = ['olive']*10 + ['palm']*10 # shape: (20,)
# Save for later in the long CSV layout shown above:
# one block of rows per class, one column per replicate spectrum
blocks = []
for label, spectra in [('olive', olive_spectra), ('palm', palm_spectra)]:
    block = pd.DataFrame(spectra.T, columns=[f's{i+1}' for i in range(spectra.shape[0])])
    block.insert(0, 'batch', 'A')
    block.insert(0, 'oil_type', label)
    block.insert(0, 'wavenumber', wavenumbers)
    blocks.append(block)
pd.concat(blocks, ignore_index=True).to_csv('oils_demo.csv', index=False)
Complete Example: Oil Classification¶
Step 1: Create synthetic data¶
import numpy as np
import pandas as pd
from pathlib import Path
# Create toy dataset
np.random.seed(42)
wavenumbers = np.linspace(1000, 3000, 150)
# Generate spectra with class-specific patterns
olive = np.random.normal(5.0, 0.3, (15, 150))
palm = np.random.normal(6.0, 0.3, (15, 150))
sunflower = np.random.normal(4.5, 0.3, (15, 150))
# Combine
X = np.vstack([olive, palm, sunflower]) # (45, 150)
y = ['olive']*15 + ['palm']*15 + ['sunflower']*15 # (45,)
# Save in the long CSV layout (one block of rows per class, one column per replicate spectrum)
blocks = []
for label, spectra in [('olive', olive), ('palm', palm), ('sunflower', sunflower)]:
    block = pd.DataFrame(spectra.T, columns=[f's{i+1}' for i in range(spectra.shape[0])])
    block.insert(0, 'batch', 'A')
    block.insert(0, 'oil_type', label)
    block.insert(0, 'wavenumber', wavenumbers)
    blocks.append(block)
pd.concat(blocks, ignore_index=True).to_csv('oils_demo.csv', index=False)
print(f"✓ Created oils_demo.csv: {X.shape[0]} spectra, {X.shape[1]} wavenumbers")
Step 2: Load and explore¶
from foodspec import SpectralDataset
import matplotlib.pyplot as plt
# Load from CSV
ds = SpectralDataset.from_csv(
'oils_demo.csv',
wavenumber_col='wavenumber',
label_col='oil_type'
)
print(f"Loaded {len(ds)} spectra")
print(f"Wavenumber range: {ds.wavenumbers[0]:.1f}–{ds.wavenumbers[-1]:.1f} cm⁻¹")
print(f"Classes: {set(ds.labels)}")
# Visualize raw spectra
fig, ax = plt.subplots(figsize=(10, 4))
for label in set(ds.labels):
    idx = ds.labels == label
    ax.plot(ds.wavenumbers, ds.x[idx].mean(axis=0), label=label, linewidth=2)
ax.set_xlabel('Wavenumber (cm⁻¹)')
ax.set_ylabel('Intensity')
ax.legend()
plt.tight_layout()
plt.savefig('spectra_raw.png', dpi=150)
print("✓ Saved spectra_raw.png")
Step 3: Preprocess¶
from foodspec.preprocessing import (
baseline_als,
normalize_vector,
smooth_savitzky_golay
)
# Preprocessing pipeline
X = ds.x.copy()
# 1. Baseline correction (remove instrumental drift)
X = baseline_als(X, lambda_=1e5, p=0.01)
# 2. Savitzky-Golay smoothing (reduce noise)
X = smooth_savitzky_golay(X, window_length=9, polyorder=3)
# 3. Vector normalization (make samples comparable)
X = normalize_vector(X, norm='l2')
print(f"✓ Preprocessed shape: {X.shape}")
# Visualize preprocessed spectra
fig, ax = plt.subplots(figsize=(10, 4))
labels_unique = sorted(set(ds.labels))
for label in labels_unique:
    idx = ds.labels == label
    ax.plot(ds.wavenumbers, X[idx].mean(axis=0), label=label, linewidth=2)
ax.set_xlabel('Wavenumber (cm⁻¹)')
ax.set_ylabel('Normalized Intensity')
ax.legend()
plt.tight_layout()
plt.savefig('spectra_preprocessed.png', dpi=150)
print("✓ Saved spectra_preprocessed.png")
Step 4: Explore with PCA¶
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# PCA
pca = PCA(n_components=2)
scores = pca.fit_transform(X)
# Plot
fig, ax = plt.subplots(figsize=(8, 6))
colors = {'olive': 'green', 'palm': 'orange', 'sunflower': 'gold'}
for label in set(ds.labels):
    idx = ds.labels == label
    ax.scatter(scores[idx, 0], scores[idx, 1], label=label, s=100,
               color=colors.get(label, 'blue'), alpha=0.7)
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('pca_scores.png', dpi=150)
print(f"✓ Saved pca_scores.png (variance explained: {sum(pca.explained_variance_ratio_):.1%})")
Step 5: Train and evaluate classifier¶
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
# Create classifier
clf = RandomForestClassifier(n_estimators=50, random_state=42)
# Cross-validation
scores = cross_val_score(clf, X, ds.labels, cv=5, scoring='balanced_accuracy')
print(f"Cross-validation balanced accuracy: {scores.mean():.3f} (±{scores.std():.3f})")
# Train on all data (for demo; the metrics below are computed on the training set and are optimistic)
clf.fit(X, ds.labels)
# Predictions
y_pred = clf.predict(X)
# Print metrics
print("\nClassification report:")
print(classification_report(ds.labels, y_pred))
# Confusion matrix
cm = confusion_matrix(ds.labels, y_pred)
print(f"\nConfusion matrix:\n{cm}")
Step 6: Save model and results¶
import json
import pickle
# Save model
with open('model.pkl', 'wb') as f:
    pickle.dump(clf, f)
print("✓ Saved model.pkl")
# Save metadata
metadata = {
'model': 'RandomForestClassifier',
'n_samples': len(ds),
'n_features': X.shape[1],
'classes': list(set(ds.labels)),
'cv_accuracy': float(scores.mean()),
'timestamp': pd.Timestamp.now().isoformat()
}
with open('metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print("✓ Saved metadata.json")
Real-World Workflow: Oil Authentication¶
Use built-in ratiometric features (RQ engine) for interpretable results:
from foodspec.features.rq import RatioQualityEngine, RQConfig
# Define discriminative ratios
config = RQConfig(
ratios=[
('1742', '2720'), # C=O / C=C (common marker)
('1652', '2720'), # C=C bend / C=C stretch
('1600', '2720'), # Aromatic / aliphatic
]
)
# Compute features
rq = RatioQualityEngine(config=config)
features = rq.compute(X, ds.wavenumbers)
print(f"RQ features shape: {features.shape}")
print(f"Feature names: {rq.feature_names}")
# Train on features instead of raw spectra
clf_rq = RandomForestClassifier(n_estimators=50, random_state=42)
scores_rq = cross_val_score(clf_rq, features, ds.labels, cv=5)
print(f"RQ-based CV accuracy: {scores_rq.mean():.3f}")
Expected Outputs¶
Plots generated¶
- ✅ spectra_raw.png — Raw spectra by class
- ✅ spectra_preprocessed.png — Preprocessed spectra
- ✅ pca_scores.png — PCA scores plot
Files generated¶
- ✅ oils_demo.csv — Synthetic dataset
- ✅ model.pkl — Trained classifier
- ✅ metadata.json — Run metadata
Console output¶
✓ Created oils_demo.csv: 45 spectra, 150 wavenumbers
Loaded 45 spectra
Wavenumber range: 1000.0–3000.0 cm⁻¹
Classes: {'olive', 'palm', 'sunflower'}
✓ Preprocessed shape: (45, 150)
✓ Saved spectra_raw.png
✓ Saved spectra_preprocessed.png
✓ Saved pca_scores.png (variance explained: 83.5%)
Cross-validation balanced accuracy: 0.967 (±0.049)
Classification report:
              precision    recall  f1-score   support

       olive       0.93      1.00      0.96        15
        palm       1.00      0.93      0.96        15
   sunflower       0.93      1.00      0.96        15

    accuracy                           0.98        45
   macro avg       0.95      0.98      0.96        45
weighted avg       0.95      0.98      0.96        45
Additional Resources¶
- Data Format Reference - Schema formats, unit conventions, validation checklist
- Glossary - Definitions of wavenumber, baseline, normalization, CV strategy, etc.
- API Reference - Complete API documentation
- Preprocessing Guide - Detailed preprocessing recipes
Troubleshooting (Top 5 Issues)¶
1️⃣ "ModuleNotFoundError: No module named 'foodspec'"¶
Cause: FoodSpec not installed
Fix:
pip install --upgrade foodspec
python -c "import foodspec; print(foodspec.__version__)"
2️⃣ "ValueError: wavenumber column not found"¶
Cause: Column name doesn't match
Fix:
# Check CSV columns
import pandas as pd
df = pd.read_csv('oils.csv')
print(df.columns)
# Use the correct column name
from foodspec import SpectralDataset
ds = SpectralDataset.from_csv(
    'oils.csv',
    wavenumber_col='cm-1',  # the actual column name in your file
    label_col='oil_type'
)
3️⃣ "Shape mismatch: expected (n_samples, n_features)"¶
Cause: Spectra are transposed (wavenumbers as rows, samples as columns)
Fix:
# Check shape
print(X.shape) # should be (n_samples, n_wavenumbers)
# If the wavenumber axis came first, flip so each row is one spectrum
if X.shape[0] == len(ds.wavenumbers) and X.shape[1] != len(ds.wavenumbers):
    X = X.T
4️⃣ "All NaN or constant feature"¶
Cause: Bad preprocessing (too aggressive baseline correction)
Fix:
# Check for NaN after preprocessing
import numpy as np
print(f"NaN count: {np.isnan(X).sum()}")
print(f"Const columns: {(X.std(axis=0) == 0).sum()}")
# Use gentler baseline correction
from foodspec.preprocessing import baseline_als
X = baseline_als(X, lambda_=1e4, p=0.1) # less aggressive
# Remove constant columns
X = X[:, X.std(axis=0) > 1e-10]
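A quick visual check also helps: overlay one raw spectrum with its corrected version and confirm the peaks survive (sketch using the dataset loaded earlier):
import matplotlib.pyplot as plt
from foodspec.preprocessing import baseline_als
corrected = baseline_als(ds.x[:1], lambda_=1e4, p=0.1)
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(ds.wavenumbers, ds.x[0], label='raw')
ax.plot(ds.wavenumbers, corrected[0], label='baseline-corrected')
ax.legend()
plt.tight_layout()
plt.savefig('baseline_check.png', dpi=150)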
5️⃣ "Memory error on large dataset"¶
Cause: Loading entire dataset into memory
Fix:
# Process in chunks
import numpy as np
from foodspec.preprocessing import baseline_als
chunk_size = 100
for i in range(0, len(ds), chunk_size):
    chunk = ds.x[i:i+chunk_size]
    chunk_processed = baseline_als(chunk, lambda_=1e5, p=0.01)
    # ... process the chunk ...
# Or subsample for exploration
ds_small = SpectralDataset(
ds.x[::10], # every 10th spectrum
ds.wavenumbers,
ds.labels[::10]
)
Copy-Paste One-Liner (Start Here)¶
# The full pipeline, condensed into one line (builds the dataset directly with the SpectralDataset constructor instead of a CSV round-trip)
import numpy as np; from foodspec import SpectralDataset; from foodspec.preprocessing import baseline_als, normalize_vector, smooth_savitzky_golay; from sklearn.ensemble import RandomForestClassifier; from sklearn.model_selection import cross_val_score; np.random.seed(42); w = np.linspace(1000, 3000, 150); X = np.vstack([np.random.normal(5, 0.3, (15, 150)), np.random.normal(6, 0.3, (15, 150))]); y = np.array(['olive']*15 + ['palm']*15); ds = SpectralDataset(X, w, y); X_prep = baseline_als(ds.x, lambda_=1e5, p=0.01); X_prep = smooth_savitzky_golay(X_prep, window_length=9, polyorder=3); X_prep = normalize_vector(X_prep, norm='l2'); scores = cross_val_score(RandomForestClassifier(n_estimators=50, random_state=42), X_prep, ds.labels, cv=5); print(f"✓ CV accuracy: {scores.mean():.3f}")
Next Steps¶
- ✅ Try the CLI version: CLI Quickstart
- ✅ Explore workflows: Oil Authentication
- ✅ Custom preprocessing: Preprocessing Guide
- ✅ Advanced models: ML Guide
Need Help?¶
- Getting errors (NaNs, shape mismatches, overfitting)? → Troubleshooting Guide
- Questions about methods or usage? → FAQ
- Report a bug: GitHub Issues