-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathmain_filtering.py
More file actions
94 lines (77 loc) · 4.08 KB
/
main_filtering.py
File metadata and controls
94 lines (77 loc) · 4.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from dotenv import load_dotenv
from openai import OpenAI
import time
from repofinder.filtering.ai_classifier import compute_ai_predictions
from repofinder.filtering.ai_type_classifier import compute_ai_type_predictions
from repofinder.filtering.filter_utils import update_predictions_in_db
from repofinder.filtering.manual_labels import update_manual_labels
from repofinder.filtering.ml_classifier import train_models
from repofinder.filtering.roc_multiple import create_roc_curves
from repofinder.filtering.score_based_classifier import compute_predictions_sbc
from repofinder.filtering.type_classifier_accuracy import type_classifier_accuracy
DOTENV = ".env"
load_dotenv(DOTENV)
OPENAI_TOKEN = os.getenv("OPENAI_TOKEN")
client = OpenAI(api_key=OPENAI_TOKEN)
def filter_repos(university_acronyms = ['UCSB', 'UCSC', 'UCSD']): # Only 'UCSB', 'UCSC', 'UCSD' have manual labels for now
for acronym in university_acronyms:
config_file= f"config/config_{acronym}.json"
db_file = f"Data/db/repository_data_{acronym}_database.db"
manual_labels = f"Data/manual_labels/{acronym}_Random200.csv"
test_set = f"Data/test_data/test_set_{acronym}.csv"
type_test_set = f"Data/test_data/type_test_set_{acronym}.csv"
# Affiliation Classification
sbc_predictions = compute_predictions_sbc(acronym, config_file, db_file, subset=test_set)
update_manual_labels(manual_labels, db_file) # Needed for supervised ML models
ml_university_matrix_predictions = train_models(acronym, config_file, db_file, method="matrix", build_matrix=True)
ml_university_embeddings_predictions = train_models(acronym, config_file, db_file, method="embeddings", build_matrix=True, client=client, subset=test_set)
ai_predictions_35 = compute_ai_predictions(
acronym, config_file, db_file, client,
model="gpt-3.5-turbo",
# subset=test_set, # Use test_set to only run classification on a subset
truncation_type="start_end",
truncate=20000,
start_length=15000,
end_length=5000
)
ai_predictions_4o = compute_ai_predictions(
acronym, config_file, db_file, client,
model="gpt-4o",
# subset=test_set, # Use test_set to only run classification on a subset
truncation_type="start_end",
truncate=20000,
start_length=15000,
end_length=5000
)
ai_predictions_5 = compute_ai_predictions(
acronym, config_file, db_file, client,
model="gpt-5-mini",
# subset=test_set, # Use test_set to only run classification on a subset
truncation_type="start_end",
truncate=20000,
start_length=15000,
end_length=5000
)
# ai_predictions_5 = f"results/{acronym}/predictions_ai_gpt-5-mini_{acronym}.csv" --> Unused, but kept for reference
update_predictions_in_db(ai_predictions_5, db_file, "affiliation_prediction_gpt_5_mini")
#Project Type classification (only for gpt-5-mini)
ai_type_predictions_5 = compute_ai_type_predictions(
acronym, config_file, db_file, client,
model="gpt-5-mini",
#subset=type_test_set, # Use test_set to only run classification on a subset
truncation_type="start_end",
truncate=20000,
start_length=15000,
end_length=5000,
affiliated_only=True,
affiliation_threshold=0.4
)
# ai_type_predictions_5 = f"results/{acronym}/repo_type_gpt-5-mini_{acronym}_subset.csv" --> Unused, but kept for reference
update_predictions_in_db(ai_type_predictions_5, db_file, "type_prediction_gpt_5_mini")
type_classifier_accuracy(f"Data/test_data/type_test_set_{acronym}.csv", ai_type_predictions_5, acronym)
create_roc_curves(university_acronyms, curves_to_plot=['SBC', 'embeddings','gpt-35-turbo', 'gpt-4o', 'gpt-5-mini'])
if __name__ == "__main__":
filter_repos()