Machine-Learning-Project/app.py at main · RachelB9913/Machine-Learning-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import json
import gradio as gr
import joblib
import pandas as pd
from related_topics_prediction import MultiLabelThresholdOptimizer


def convert_to_float(value):
    """Parse a count such as ``"1.5K"``, ``"2M"`` or ``"42"`` into a float.

    Args:
        value: A number, or a string holding a number with an optional
            ``K`` (thousand) or ``M`` (million) suffix. The suffix is
            case-insensitive; surrounding whitespace and thousands
            separators (``,``) are ignored.

    Returns:
        The numeric value as a float.

    Raises:
        ValueError: If the text cannot be interpreted as a number.
    """
    # Numbers are accepted as-is so callers do not have to stringify them first.
    if isinstance(value, (int, float)):
        return float(value)

    multipliers = {'K': 1_000, 'M': 1_000_000}
    text = str(value).strip().replace(',', '').upper()
    suffix = text[-1:]
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    return float(text)  # Plain number without a magnitude suffix


def convert_to_string(value):
    """Format a count compactly, e.g. ``1500`` -> ``"1.5K"``.

    Args:
        value: The number to format.

    Returns:
        ``"<x.y>M"`` for values of at least one million, ``"<x.y>K"`` for
        values of at least one thousand, otherwise the value truncated to an
        integer and rendered as text.
    """
    if value >= 1_000_000:
        return f"{value / 1_000_000:.1f}M"
    if value >= 1_000:
        thousands = f"{value / 1_000:.1f}"
        # Values just below one million round up to "1000.0"; promote them to
        # the next unit instead of emitting "1000.0K".
        if float(thousands) >= 1_000:
            return f"{value / 1_000_000:.1f}M"
        return f"{thousands}K"
    return str(int(value))  # Keep it as an integer if it's below 1,000


# Prediction targets served by a pickled ``(model, feature_names)`` pair:
# radio-button value -> (artifact path, template for the answer text).
_REGRESSION_TARGETS = {
    "acceptance": ("accepted_submissions_regression_model.pkl", "the accepted is: {}"),
    "number of likes": ("likes_random_forest_regression_model.pkl", "the likes amount is: {}"),
    "number of dislikes": ("dislikes_XGB_regression_model.pkl", "the dislikes amount is: {}"),
}

# Class label produced by the difficulty classifier -> display name. This is
# the same numbering used to encode the 'difficulty' input column.
_DIFFICULTY_LABELS = {0: "Easy", 1: "Hard", 2: "Medium"}


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _encode_title(title_column):
    """Encode the title column with the persisted title encoder."""
    encoder = joblib.load("title_encoder.pkl")
    try:
        # Use the mapping learned at training time; re-fitting on a single row
        # would discard it and map every title to 0.
        return encoder.transform(title_column)
    except ValueError:
        # NOTE(review): presumably a scikit-learn LabelEncoder, which raises
        # ValueError for labels it has not seen - confirm. Fall back to 0,
        # the code the previous single-row re-fit always produced.
        return 0


def _build_feature_frame(title, description, difficulty, topics, likes, accepted, submission, comments,
                         is_premium):
    """Build the one-row feature frame (before TF-IDF) from the raw form values."""
    likes_count = convert_to_float(likes)
    accepted_count = convert_to_float(accepted)
    submission_count = convert_to_float(submission)

    if topics is None:
        selected_topics = []
    elif isinstance(topics, str):
        selected_topics = [topic.strip() for topic in topics.split(',')]
    else:
        selected_topics = list(topics)

    base = pd.DataFrame([{
        'id': 1,
        'title': str(title),
        'description': str(description),
        'is_premium': 1 if is_premium == "premium" else 0,
        'difficulty': 0 if difficulty == "Easy" else 1 if difficulty == "Hard" else 2,
        'acceptance_rate': _safe_ratio(accepted_count, submission_count),
        'frequency': 0,
        'discuss_count': convert_to_float(comments),
        'accepted': accepted_count,
        'submissions': submission_count,
        'likes': likes_count,
        'dislikes': 0,
        # Dislikes are not part of the form, so the ratio is taken against
        # likes alone; guarded so that zero likes does not divide by zero.
        'rating': _safe_ratio(likes_count, likes_count),
        'asked_by_faang': 0,
        'similar_questions': ""
    }])

    # Multi-hot encodings. The form collects no company information, so every
    # company flag is 0.
    company_flags = dict.fromkeys(companies_columns, 0)
    topic_flags = {topic: 1 if topic in selected_topics else 0 for topic in the_topics}
    frame = pd.concat([base, pd.DataFrame([company_flags]), pd.DataFrame([topic_flags])], axis=1)

    frame['title'] = _encode_title(frame['title'])
    return frame


def _predict_related_topics(frame):
    """Predict the related topics from the description text."""
    vectorizer = joblib.load("related_topics_vectorizer.pkl")
    new_tfidf = vectorizer.transform(frame["description"])

    best_model_info = joblib.load('best_model_related_topics_info.pkl')
    best_model = joblib.load("best_related_topics_model.pkl")
    model_name = best_model_info['model_name']

    # Restore the tuned decision threshold stored alongside the model.
    optimizer = MultiLabelThresholdOptimizer()
    optimizer.optimal_thresholds[model_name] = best_model_info['threshold']
    predictions = optimizer.predict(best_model, new_tfidf, model_name)

    mlb = joblib.load("related_topics_label_binarizer.pkl")
    predictions = mlb.inverse_transform(predictions)
    return f"the related topics are: {', '.join(map(str, predictions[0]))}"


def _with_tfidf_features(frame):
    """Replace the 'description' column with its TF-IDF feature columns."""
    vectorizer = joblib.load("tfidf_vectorizer.pkl")
    new_tfidf = vectorizer.transform(frame["description"])
    new_tfidf_df = pd.DataFrame(new_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
    return pd.concat([frame, new_tfidf_df], axis=1).drop(columns=['description'])


def _predict_difficulty(frame, likes):
    """Predict the difficulty level, estimating the dislikes feature first."""
    # The form has no dislikes field, so estimate it with the dislikes model.
    dislikes_model, feature_names = joblib.load("dislikes_XGB_regression_model.pkl")
    predicted_dislikes = float(dislikes_model.predict(frame[feature_names])[0])
    likes_count = convert_to_float(likes)
    frame['dislikes'] = predicted_dislikes
    frame['rating'] = _safe_ratio(likes_count, likes_count + predicted_dislikes)

    class_model = joblib.load("level_classifier_model.pkl")
    trained_feature_names = class_model.named_steps['standardscaler'].get_feature_names_out()

    # Add any column the classifier was trained on but the frame lacks
    # *before* selecting, otherwise the selection itself raises KeyError.
    missing = [col for col in trained_feature_names if col not in frame.columns]
    if missing:
        frame = pd.concat([frame, pd.DataFrame(0, index=frame.index, columns=missing)], axis=1)
    frame = frame[trained_feature_names]  # Keep only the trained columns, in order

    label = class_model.predict(frame)[0]
    prediction = _DIFFICULTY_LABELS.get(label, str(label))
    return f"the level difficulty is: {prediction}"


def _predict_count(frame, model_path, answer_template):
    """Run one of the count regressors and format its prediction."""
    model, feature_names = joblib.load(model_path)
    predictions = model.predict(frame[feature_names])  # Select only the required features
    return answer_template.format(convert_to_string(predictions[0]))


def greet(title, description, difficulty, topics, likes, accepted, submission, comments, is_premium, predict):
    """Predict one attribute of a LeetCode question from the form inputs.

    Args:
        title: Question title.
        description: Question text.
        difficulty: ``"Easy"``, ``"Medium"`` or ``"Hard"``.
        topics: Selected topics, as a list or a comma-separated string.
        likes: Likes count; may use a ``K``/``M`` suffix.
        accepted: Accepted-submissions count; may use a ``K``/``M`` suffix.
        submission: Total-submissions count; may use a ``K``/``M`` suffix.
        comments: Comments count; may use a ``K``/``M`` suffix.
        is_premium: ``"premium"`` or ``"not premium"``.
        predict: What to predict: ``"acceptance"``, ``"difficulty level"``,
            ``"number of likes"``, ``"number of dislikes"`` or
            ``"related topics"``.

    Returns:
        A sentence describing the prediction, or a prompt to choose a
        prediction target when ``predict`` is not one of the known values.

    Raises:
        ValueError: If one of the count inputs is not a valid number.
    """
    x_new = _build_feature_frame(title, description, difficulty, topics, likes, accepted, submission,
                                 comments, is_premium)

    if predict == "related topics":
        return _predict_related_topics(x_new)

    x_new = _with_tfidf_features(x_new)

    if predict == "difficulty level":
        return _predict_difficulty(x_new, likes)

    if predict in _REGRESSION_TARGETS:
        model_path, answer_template = _REGRESSION_TARGETS[predict]
        return _predict_count(x_new, model_path, answer_template)

    return "please choose what you would like to predict"


# Load the column names used for the multi-hot topic and company encodings.
# JSON is UTF-8 by specification, so do not rely on the platform default.
with open("encoding_metadata.json", "r", encoding="utf-8") as f:
    encoding_metadata = json.load(f)

# Drop empty-string entries. Filtering (rather than list.remove) tolerates
# metadata with no such entry and removes every occurrence if there are several.
the_topics = [topic for topic in encoding_metadata["related_topics_columns"] if topic != ""]
companies_columns = [company for company in encoding_metadata["companies_columns"] if company != ""]

# The input components are passed to `greet` positionally, so their order
# must match its parameter order.
demo = gr.Interface(
    fn=greet,
    inputs=[gr.Text(label="Title"), gr.Text(label="Description"),
            gr.Radio(choices=["Easy", "Medium", "Hard"], label="Difficulty Level"),
            gr.Dropdown(the_topics, multiselect=True, label="Related Topics",
                        info="choose all the related topics of this question"),
            gr.Text(label="Likes Amount"),
            gr.Text(label="Accepted Amount"),
            gr.Text(label="Submission Amount"),
            gr.Text(label="Comments Amount"),
            gr.Radio(choices=["premium", "not premium"], label="Is Premium"),
            gr.Radio(choices=["acceptance", "difficulty level", "number of likes", "number of dislikes",
                              "related topics"], label="Please Predict..")
            ],
    outputs=[gr.Text(label="The Prediction")],
    title="LEETCODE PREDICTOR",
    description="please go to the leetcode website (https://leetcode.com/problemset//) choose a question and copy the question's detiles to the relevant spaces, then choose what you whould like to predict and submit. the prediction result will appear on the right side of the screen 😉"
)

# Start the Gradio server for the interface defined above.
demo.launch()