Skip to content

Commit f84e34d

Browse files
noahgift and claude committed
[GREEN] feat(oracle): Add depyler-specific training corpus + model evaluation (Refs GH-105)
Training corpus from actual DEPYLER-0551 through DEPYLER-0555 fixes:
- TypeMismatch: 14 samples (serde_json::Value, dict inference)
- TraitBound: 10 samples (PathBuf, datetime, hashlib)
- MissingImport: 2 samples (chrono mapping)
- SyntaxError: 1 sample (return inference)

Model evaluation (leave-one-out CV):
- Accuracy: 59.26% (16/27) - needs more samples
- Throughput: 2478 predictions/sec
- Corpus size: 27 samples (target: 100+)

Uses aprender metrics for evaluation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
1 parent b176b14 commit f84e34d

File tree

3 files changed

+486
-0
lines changed

3 files changed

+486
-0
lines changed
Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
//! Depyler-specific training data for error classification.
2+
//!
3+
//! Captures error patterns from actual transpilation fixes (DEPYLER-0551 through DEPYLER-0555+).
4+
//! Uses aprender's model evaluation for cross-validation.
5+
6+
use crate::classifier::ErrorCategory;
7+
use crate::training::{TrainingDataset, TrainingSample};
8+
9+
/// Build depyler-specific training dataset from actual fixes.
10+
#[must_use]
11+
pub fn build_depyler_corpus() -> TrainingDataset {
12+
let mut dataset = TrainingDataset::new();
13+
14+
// DEPYLER-0551: Error types + PathBuf methods
15+
add_pathbuf_samples(&mut dataset);
16+
17+
// DEPYLER-0552: Dict access type inference
18+
add_dict_inference_samples(&mut dataset);
19+
20+
// DEPYLER-0553: datetime.datetime chain + instance methods
21+
add_datetime_samples(&mut dataset);
22+
23+
// DEPYLER-0554: Function return type + if/else returns
24+
add_return_type_samples(&mut dataset);
25+
26+
// DEPYLER-0555: hashlib/file read patterns
27+
add_file_io_samples(&mut dataset);
28+
29+
// Type inference: serde_json::Value defaults
30+
add_type_inference_samples(&mut dataset);
31+
32+
dataset
33+
}
34+
35+
fn add_pathbuf_samples(dataset: &mut TrainingDataset) {
36+
dataset.add_many(vec![
37+
TrainingSample::with_fix(
38+
"error[E0599]: no method named `exists` found for type `String`",
39+
ErrorCategory::TraitBound,
40+
"Use std::path::PathBuf::from(&path).exists() instead of String.exists()",
41+
),
42+
TrainingSample::with_fix(
43+
"error[E0599]: no method named `is_file` found for type `String`",
44+
ErrorCategory::TraitBound,
45+
"Convert to PathBuf: std::path::PathBuf::from(&path).is_file()",
46+
),
47+
TrainingSample::with_fix(
48+
"error[E0599]: no method named `stat` found for type `PathBuf`",
49+
ErrorCategory::TraitBound,
50+
"Use path.metadata() instead of path.stat() - Rust uses metadata()",
51+
),
52+
TrainingSample::with_fix(
53+
"error[E0277]: the trait bound `PathBuf: From<Option<String>>` is not satisfied",
54+
ErrorCategory::TypeMismatch,
55+
"Unwrap Option before PathBuf conversion: path.map(PathBuf::from)",
56+
),
57+
]);
58+
}
59+
60+
fn add_dict_inference_samples(dataset: &mut TrainingDataset) {
61+
dataset.add_many(vec![
62+
TrainingSample::with_fix(
63+
"error[E0308]: mismatched types expected `&String`, found `&&serde_json::Value`",
64+
ErrorCategory::TypeMismatch,
65+
"Fix type inference: parameter should be String/&str not serde_json::Value",
66+
),
67+
TrainingSample::with_fix(
68+
"error[E0277]: the trait bound `String: Borrow<&str>` is not satisfied",
69+
ErrorCategory::TraitBound,
70+
"HashMap key type mismatch: use &str or String consistently",
71+
),
72+
TrainingSample::with_fix(
73+
"error[E0599]: no method named `get` found for enum `serde_json::Value`",
74+
ErrorCategory::TypeMismatch,
75+
"Type should be HashMap not Value - fix dict type inference",
76+
),
77+
TrainingSample::with_fix(
78+
"expected `HashMap<String, String>`, found `HashMap<String, serde_json::Value>`",
79+
ErrorCategory::TypeMismatch,
80+
"Dict value type inference: propagate concrete type from usage",
81+
),
82+
]);
83+
}
84+
85+
fn add_datetime_samples(dataset: &mut TrainingDataset) {
86+
dataset.add_many(vec![
87+
TrainingSample::with_fix(
88+
"error[E0433]: failed to resolve: use of undeclared type `DateTime`",
89+
ErrorCategory::MissingImport,
90+
"datetime.datetime.fromtimestamp() → chrono::DateTime::from_timestamp()",
91+
),
92+
TrainingSample::with_fix(
93+
"error[E0599]: no method named `isoformat` found",
94+
ErrorCategory::TraitBound,
95+
"dt.isoformat() → dt.to_string() for chrono DateTime",
96+
),
97+
TrainingSample::with_fix(
98+
"error[E0599]: no method named `strftime` found for struct `NaiveDateTime`",
99+
ErrorCategory::TraitBound,
100+
"dt.strftime(fmt) → dt.format(fmt).to_string() for chrono",
101+
),
102+
TrainingSample::with_fix(
103+
"error[E0599]: no method named `timestamp` found for struct `NaiveDateTime`",
104+
ErrorCategory::TraitBound,
105+
"dt.timestamp() → dt.and_utc().timestamp() as f64",
106+
),
107+
TrainingSample::with_fix(
108+
"error[E0599]: no method named `fromtimestamp`",
109+
ErrorCategory::MissingImport,
110+
"datetime.datetime.fromtimestamp → chrono::DateTime::from_timestamp",
111+
),
112+
]);
113+
}
114+
115+
fn add_return_type_samples(dataset: &mut TrainingDataset) {
116+
dataset.add_many(vec![
117+
TrainingSample::with_fix(
118+
"error[E0308]: mismatched types expected `()`, found `String`",
119+
ErrorCategory::TypeMismatch,
120+
"Function missing return type: infer -> String from if/else branches",
121+
),
122+
TrainingSample::with_fix(
123+
"error[E0308]: mismatched types expected `()`, found `i32`",
124+
ErrorCategory::TypeMismatch,
125+
"Function missing return type: add return type annotation",
126+
),
127+
TrainingSample::with_fix(
128+
"missing `return` keyword in if branch",
129+
ErrorCategory::SyntaxError,
130+
"If branches need explicit return when not final expression",
131+
),
132+
TrainingSample::with_fix(
133+
"error[E0308]: `if` missing an `else` clause",
134+
ErrorCategory::TypeMismatch,
135+
"If expression needs else clause for type inference",
136+
),
137+
]);
138+
}
139+
140+
fn add_file_io_samples(dataset: &mut TrainingDataset) {
141+
dataset.add_many(vec![
142+
TrainingSample::with_fix(
143+
"error[E0308]: expected `&mut [u8]`, found integer",
144+
ErrorCategory::TypeMismatch,
145+
"Python f.read(8192) → Rust requires buffer: let mut buf = vec![0u8; 8192]",
146+
),
147+
TrainingSample::with_fix(
148+
"error[E0599]: no method named `hexdigest` found for struct `String`",
149+
ErrorCategory::TraitBound,
150+
"hashlib.hexdigest() → use sha2/md5 crate with .finalize() and hex encoding",
151+
),
152+
TrainingSample::with_fix(
153+
"error[E0599]: no method named `update` found for struct `String`",
154+
ErrorCategory::TraitBound,
155+
"hasher.update(chunk) → use Digest trait from sha2 crate",
156+
),
157+
TrainingSample::with_fix(
158+
"error[E0599]: no method named `is_empty` found for enum `Result`",
159+
ErrorCategory::TypeMismatch,
160+
"Walrus operator pattern: while chunk := f.read() needs different Rust idiom",
161+
),
162+
]);
163+
}
164+
165+
fn add_type_inference_samples(dataset: &mut TrainingDataset) {
166+
dataset.add_many(vec![
167+
TrainingSample::with_fix(
168+
"error[E0606]: casting `&serde_json::Value` as `i64` is invalid",
169+
ErrorCategory::TypeMismatch,
170+
"Parameter type should be f64 not Value - infer from cast usage",
171+
),
172+
TrainingSample::with_fix(
173+
"error[E0308]: expected `f64`, found `&serde_json::Value`",
174+
ErrorCategory::TypeMismatch,
175+
"Numeric parameter defaulted to Value - propagate type from arithmetic",
176+
),
177+
TrainingSample::with_fix(
178+
"error[E0599]: no method named `to_uppercase` found for enum `serde_json::Value`",
179+
ErrorCategory::TypeMismatch,
180+
"String method on Value - parameter should be String not Value",
181+
),
182+
TrainingSample::with_fix(
183+
"error[E0599]: no method named `len` found for reference `&serde_json::Value`",
184+
ErrorCategory::TypeMismatch,
185+
"Collection method on Value - infer Vec/String from .len() usage",
186+
),
187+
TrainingSample::with_fix(
188+
"error[E0599]: the method `join` exists but trait bounds not satisfied",
189+
ErrorCategory::TraitBound,
190+
"Vec<Value> should be Vec<String> for join() - propagate element type",
191+
),
192+
TrainingSample::with_fix(
193+
"error[E0282]: type annotations needed",
194+
ErrorCategory::TypeMismatch,
195+
"Insufficient type context - add explicit annotation or infer from usage",
196+
),
197+
]);
198+
}
199+
200+
/// Get error-fix pairs formatted for NgramFixPredictor training.
201+
#[must_use]
202+
pub fn get_training_pairs() -> Vec<(String, String, ErrorCategory)> {
203+
build_depyler_corpus().error_fix_pairs()
204+
}
205+
206+
/// Category distribution for depyler corpus.
207+
#[must_use]
208+
pub fn corpus_stats() -> Vec<(ErrorCategory, usize)> {
209+
let dataset = build_depyler_corpus();
210+
vec![
211+
(ErrorCategory::TypeMismatch, dataset.samples_for_category(ErrorCategory::TypeMismatch).len()),
212+
(ErrorCategory::TraitBound, dataset.samples_for_category(ErrorCategory::TraitBound).len()),
213+
(ErrorCategory::MissingImport, dataset.samples_for_category(ErrorCategory::MissingImport).len()),
214+
(ErrorCategory::SyntaxError, dataset.samples_for_category(ErrorCategory::SyntaxError).len()),
215+
]
216+
}
217+
218+
#[cfg(test)]
mod tests {
    use super::*;

    /// The corpus must contain a minimally useful number of samples.
    #[test]
    fn test_depyler_corpus_not_empty() {
        let corpus = build_depyler_corpus();
        assert!(corpus.len() >= 20, "Corpus should have at least 20 samples");
    }

    /// Every sample is created with a fix, so the number of error/fix pairs
    /// must equal the number of samples.
    #[test]
    fn test_all_samples_have_fixes() {
        let corpus = build_depyler_corpus();
        let pairs = corpus.error_fix_pairs();
        assert_eq!(pairs.len(), corpus.len(), "All samples should have fixes");
    }

    /// The reported categories must account for at least 20 samples, and
    /// TypeMismatch must be the largest of them.
    #[test]
    fn test_category_distribution() {
        let stats = corpus_stats();
        let total: usize = stats.iter().map(|(_, c)| c).sum();
        assert!(total >= 20);

        // TypeMismatch should be the largest category (our main issue)
        let type_mismatch_count = stats
            .iter()
            .find(|(cat, _)| *cat == ErrorCategory::TypeMismatch)
            .map(|(_, c)| *c)
            .unwrap_or(0);
        assert!(type_mismatch_count >= 8, "TypeMismatch should have most samples");

        // Check the "largest category" claim itself: the lower bound above
        // would still pass if another category overtook TypeMismatch.
        let largest_count = stats.iter().map(|(_, c)| *c).max().unwrap_or(0);
        assert_eq!(
            type_mismatch_count, largest_count,
            "TypeMismatch should be the largest category"
        );
    }

    /// No training pair may have an empty error message or an empty fix.
    #[test]
    fn test_training_pairs_format() {
        let pairs = get_training_pairs();
        for (error, fix, _category) in &pairs {
            assert!(!error.is_empty(), "Error should not be empty");
            assert!(!fix.is_empty(), "Fix should not be empty");
        }
    }
}

crates/depyler-oracle/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use aprender::tree::DecisionTreeClassifier;
1515
use serde::{Deserialize, Serialize};
1616

1717
pub mod classifier;
18+
pub mod depyler_training;
1819
pub mod features;
1920
pub mod ngram;
2021
pub mod patterns;

0 commit comments

Comments
 (0)