Overview

We built a total of three different models, each of which is able to make predictions with a different level of success.

model 1

This model is able to make predictions successfully. Some of the predictions are quite close to the actual values, but so far the results are still somewhat unpredictable. The model predicts WAR (Wins Above Replacement), a baseball stat that shows how helpful a player is to their team.

import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Script: fit an XGBoost regressor that predicts the WAR column from the
# remaining columns of baseball_stats.csv, then print every test-set
# prediction next to the actual value.

# Read the raw table.
df = pd.read_csv('baseball_stats.csv', encoding='utf-8')

# Identifier / free-text columns are not used as features for now.
# errors='ignore' means a column missing from the file is skipped silently.
df = df.drop(
    ['Rk', 'Player', 'Team', 'Lg', 'Pos', 'Awards'],
    axis=1,
    errors='ignore',
)

# Integer-encode any text columns that are still present.
# NOTE(review): this runs before dropna(), so how missing text values are
# treated depends on LabelEncoder's NaN handling - confirm this ordering is
# intended.
text_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in text_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Simplest missing-data strategy: discard every row that has a missing value.
df = df.dropna(axis=0, how='any')

# Target is WAR; every other remaining column is a feature.
y = df['WAR']
X = df.drop(columns=['WAR'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train with the library's default hyper-parameters.
model = XGBRegressor()
model.fit(X_train, y_train)

# Print each held-out prediction followed by its actual value (1-based).
predictions = model.predict(X_test)
for i, prediction in enumerate(predictions):
    print(f"Prediction {i + 1}: {prediction:.2f}")
    print(f"Actual {i + 1}: {y_test.iloc[i]:.2f}")

Prediction 1: 0.48
                                Actual 1: 0.90
                                Prediction 2: 2.14
                                Actual 2: 3.30
                                Prediction 3: -0.41
                                Actual 3: -1.40
                                Prediction 4: 2.63
                                Actual 4: 2.60
                                Prediction 5: 0.75
                                Actual 5: 0.10
                                Prediction 6: 0.48
                                Actual 6: 0.20
                                Prediction 7: -0.34
                                Actual 7: -1.20
                                Prediction 8: 1.32
                                Actual 8: 0.40
                                Prediction 9: 2.28
                                Actual 9: 1.60
                                Prediction 10: -0.06
                                Actual 10: -0.10
                                Prediction 11: 3.19
                                Actual 11: 2.60
                                Prediction 12: 1.83
                                Actual 12: 0.80
                                Prediction 13: 0.49
                                Actual 13: 0.20
                                Prediction 14: 3.63
                                Actual 14: 4.00
                                Prediction 15: 1.74
                                Actual 15: 0.80
                                Prediction 16: 1.63
                                Actual 16: 1.70
                                Prediction 17: -0.41
                                Actual 17: 0.20
                                Prediction 18: 3.48
                                Actual 18: 0.80
                                Prediction 19: -0.40
                                Actual 19: -0.40
                                Prediction 20: -0.13
                                Actual 20: 1.40
                                Prediction 21: 0.11
                                Actual 21: 0.00
                                Prediction 22: 0.12
                                Actual 22: 0.00
                                Prediction 23: -0.54
                                Actual 23: -0.50
                                Prediction 24: -1.60
                                Actual 24: -1.10
                                Prediction 25: 5.48
                                Actual 25: 5.60
                                Prediction 26: -0.12
                                Actual 26: -0.80
                                Prediction 27: 0.27
                                Actual 27: 0.20
                                Prediction 28: 0.21
                                Actual 28: 1.30
                                Prediction 29: 1.16
                                Actual 29: 0.50
                                Prediction 30: -0.08
                                Actual 30: -0.10
                                Prediction 31: 0.05
                                Actual 31: 0.60
                                Prediction 32: 0.21
                                Actual 32: -0.10
                                Prediction 33: 0.96
                                Actual 33: 0.00
                                Prediction 34: 1.21
                                Actual 34: 0.00
                                Prediction 35: 8.13
                                Actual 35: 9.10
                                Prediction 36: 1.93
                                Actual 36: 0.80
                                Prediction 37: 1.70
                                Actual 37: 0.60
                                Prediction 38: 3.19
                                Actual 38: 2.70
                                Prediction 39: 2.52
                                Actual 39: 3.40
                                Prediction 40: 1.88
                                Actual 40: 4.70
                                Prediction 41: -0.60
                                Actual 41: 0.20
                                Prediction 42: 2.68
                                Actual 42: 1.20
                                Prediction 43: -0.64
                                Actual 43: -0.40
                                Prediction 44: 2.02
                                Actual 44: 2.10
                                Prediction 45: 2.53
                                Actual 45: 1.50
                                Prediction 46: 1.23
                                Actual 46: 1.10
                                Prediction 47: 6.23
                                Actual 47: 8.70
                                Prediction 48: 3.83
                                Actual 48: 3.60
                                Prediction 49: -1.16
                                Actual 49: -0.30
                                Prediction 50: 3.47
                                Actual 50: 1.00
                                Prediction 51: 0.89
                                Actual 51: 1.50
                                Prediction 52: 2.45
                                Actual 52: 2.60
                                Prediction 53: 3.41
                                Actual 53: 5.00
                                Prediction 54: 1.09
                                Actual 54: 0.60
                                Prediction 55: 0.55
                                Actual 55: 1.00
                                Prediction 56: 4.45
                                Actual 56: 4.90
                                Prediction 57: 2.04
                                Actual 57: 2.70
                                Prediction 58: 2.99
                                Actual 58: 3.40
                                Prediction 59: 0.08
                                Actual 59: -0.90
                                Prediction 60: 2.89
                                Actual 60: 4.30
                                Prediction 61: -0.14
                                Actual 61: 0.30
                                Prediction 62: 2.10
                                Actual 62: 0.70
                                Prediction 63: 0.02
                                Actual 63: 0.30
                                Prediction 64: -0.51
                                Actual 64: -0.40
                                Prediction 65: 1.46
                                Actual 65: -0.60
                                Prediction 66: 1.71
                                Actual 66: 1.70
                                Prediction 67: 0.24
                                Actual 67: 0.90
                                Prediction 68: 3.97
                                Actual 68: 3.70
                                Prediction 69: 1.18
                                Actual 69: 0.30
                                Prediction 70: 0.61
                                Actual 70: 0.30
                                Prediction 71: 2.72
                                Actual 71: 3.10
                                Prediction 72: -0.21
                                Actual 72: -0.70
                                Prediction 73: 0.50
                                Actual 73: 0.10
                                Prediction 74: -0.35
                                Actual 74: -0.90
                                Prediction 75: 0.89
                                Actual 75: 2.60
                                Prediction 76: 0.94
                                Actual 76: 0.70
                                Prediction 77: 1.32
                                Actual 77: 0.70
                                Prediction 78: 1.88
                                Actual 78: 0.80
                                Prediction 79: -0.57
                                Actual 79: -0.70
                                Prediction 80: -0.24
                                Actual 80: -1.10
                                Prediction 81: 1.14
                                Actual 81: 0.60
                                Prediction 82: 2.92
                                Actual 82: 1.30
                                Prediction 83: -0.28
                                Actual 83: -0.20
                                Prediction 84: -0.09
                                Actual 84: 0.20
                                Prediction 85: -0.01
                                Actual 85: 0.40
                                Prediction 86: 2.23
                                Actual 86: 5.10
                                Prediction 87: 0.84
                                Actual 87: 1.60
                                Prediction 88: -1.15
                                Actual 88: -1.50
                                Prediction 89: 0.09
                                Actual 89: 0.10
                                Prediction 90: 1.90
                                Actual 90: 0.60
                                Prediction 91: 1.74
                                Actual 91: 1.40
                                Prediction 92: -0.10
                                Actual 92: 0.10
                                Prediction 93: -0.41
                                Actual 93: 0.10
                                Prediction 94: -0.12
                                Actual 94: -0.50
                                Prediction 95: -0.61
                                Actual 95: 0.40
                                Prediction 96: 1.62
                                Actual 96: 2.20
                                Prediction 97: -0.06
                                Actual 97: -0.80
                                Prediction 98: 0.90
                                Actual 98: 0.20
                                Prediction 99: -0.28
                                Actual 99: -0.20
                                Prediction 100: 3.26
                                Actual 100: 3.10

model 2

This model predicts the stats that Shohei Ohtani would have in his next game. His actual stats for that game came out to be 2 AB, 2 R, 1 H, 0 HR, and a 0.299 AVG.

import pandas as pd
import xgboost as xgb
import math
from sklearn.metrics import mean_squared_error, r2_score

# Script: train one XGBoost regressor per target stat on the game log in
# "Shohei Ohtani Last Season.csv" and predict each stat for a hypothetical
# next game whose feature values are the column-wise mean of all logged games.

# 1. Load the data
df = pd.read_csv("Shohei Ohtani Last Season.csv")

# 2. Drop non-numeric or non-useful columns
# Adjust as necessary based on your data; absent columns are ignored.
df = df.drop(columns=['Date', 'OPP', 'Team'], errors='ignore')

# 3. Set target and features
target_stats = ['AB', 'R', 'H', 'HR', 'AVG']  # AtBat, Runs, Hits, HomeRuns, AverageBattingScore

# Features are every non-target column, one-hot encoded, with missing
# values replaced by 0.
features = df.drop(columns=target_stats, errors='ignore')
features = pd.get_dummies(features).fillna(0)

# Create an input row for the next game: the mean of every feature column.
next_game_features = features.mean().to_frame().T

# Ensure the next game input matches the training columns (same order).
next_game_features = next_game_features[features.columns]

# Reset before the loop so the evaluation below can tell whether any model
# was trained here, instead of failing with NameError or silently scoring a
# `model` left over from earlier code.
model = None

# Run calculations on the target stats
print("Shohei Ohtani's Next Game Results\n")
print(f"{'Stat':<10} {'Prediction':>10}")
print('-' * 21)
for stat in target_stats:
    if stat not in df.columns:
        print(f"X Stat '{stat}' not found. Skipping.")
        continue

    # Train only on games where this stat was recorded.
    stat_df = df[df[stat].notna()]
    y = stat_df[stat]
    X = features.loc[stat_df.index]

    # 4. Train XGBoost regressor for this stat
    model = xgb.XGBRegressor(objective='reg:squarederror')
    model.fit(X, y)

    # 5. Predict this stat for the next game
    prediction = model.predict(next_game_features)[0]

    # 6. Print results
    print(f"{stat:<10} {prediction:.2f}")

# Evaluate the model on its own training data.
# This covers only the last stat that was fitted in the loop above, and it is
# scored on the same rows it was trained on, so it shows how closely the model
# fits those rows rather than how well it predicts unseen games.
if model is None:
    print("No target stats were found; skipping evaluation.")
else:
    train_preds = model.predict(X)
    mse = mean_squared_error(y, train_preds)
    r2 = r2_score(y, train_preds)
    print(f"Training MSE: {mse:.2f}, R-Squared: {r2:.2f}")

Shohei Ohtani's Next Game Results

                                Stat Prediction
                                ---------------------
                                AB 4.96
                                R 0.29
                                H 1.83
                                HR 0.00
                                AVG 0.32
                                Training MSE: 0.00, R-Squared: 1.00

model 3

This model makes very good predictions, which are usually very close to the actual number. It tries to predict each player's BA (batting average) for the year based on all of that player's other measurables for the year.

import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
# Import relevant metrics
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

# Script: train an XGBoost model that predicts one column of
# baseball_stats.csv (selected by position) from all the other remaining
# columns, save the model, print every test-set prediction next to the actual
# value, and report evaluation metrics.

# 1. Load Data
try:
    df = pd.read_csv('baseball_stats.csv', encoding='windows-1252')
    df = df.drop(columns=['Player', 'Team', 'Lg', 'WAR', 'Pos', 'Awards'])
except FileNotFoundError:
    print("Error: CSV file not found.")
    # Exit with a non-zero status so callers can detect the failure.
    raise SystemExit(1) from None

# Zero-based position of the target column in the frame AFTER the drop above.
# NOTE(review): the write-up describes the target as BA - verify that index 15
# is that column; the position shifts if the list of dropped columns changes.
target_column_index = 15

# 2. Identify Features (X) and Target (y)
if target_column_index >= len(df.columns):
    print(f"Error: Column at index {target_column_index} does not exist.")
    raise SystemExit(1)

y = df.iloc[:, target_column_index]  # Target variable (column at index 15)
# All other columns as features
X = df.drop(df.columns[target_column_index], axis=1)

# 3. Split Data into Training and Testing Sets
# You can adjust test_size and random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# 4. Initialize and Train the XGBoost Model
# A numeric target is treated as regression, anything else as classification.
# The decision is made once and reused for the evaluation step.
is_regression = pd.api.types.is_numeric_dtype(y)
if is_regression:
    # Regression task (or other regression objectives)
    model = xgb.XGBRegressor(objective='reg:squarederror')
else:
    # Classification task
    # You might need to handle class imbalances or multi-class scenarios
    # (or use other classification objectives).
    model = xgb.XGBClassifier(
        objective='multi:softmax' if y.nunique() > 2 else 'binary:logistic')
model.fit(X_train, y_train)

# 4.5 Save the model
model.save_model("initial_model.json")

# 5. Make Predictions on the Test Set
predictions = model.predict(X_test)
for i, prediction in enumerate(predictions):
    print(f"Prediction {i + 1}: {prediction:.2f}")
    print(f"Actual {i + 1}: {y_test.iloc[i]:.2f}")

# 6. Evaluate the Model
print("\n--- Model Evaluation ---")
if is_regression:
    # Regression metrics
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared: {r2:.4f}")
else:
    # Classification metrics
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)

Prediction 1: 0.20
                                Actual 1: 0.20
                                Prediction 2: 0.29
                                Actual 2: 0.30
                                Prediction 3: 0.20
                                Actual 3: 0.20
                                Prediction 4: 0.23
                                Actual 4: 0.23
                                Prediction 5: 0.27
                                Actual 5: 0.24
                                Prediction 6: 0.26
                                Actual 6: 0.26
                                Prediction 7: 0.21
                                Actual 7: 0.21
                                Prediction 8: 0.25
                                Actual 8: 0.26
                                Prediction 9: 0.28
                                Actual 9: 0.28
                                Prediction 10: 0.17
                                Actual 10: 0.20
                                Prediction 11: 0.25
                                Actual 11: 0.24
                                Prediction 12: 0.22
                                Actual 12: 0.22
                                Prediction 13: 0.26
                                Actual 13: 0.24
                                Prediction 14: 0.26
                                Actual 14: 0.26
                                Prediction 15: 0.27
                                Actual 15: 0.28
                                Prediction 16: 0.27
                                Actual 16: 0.30
                                Prediction 17: 0.19
                                Actual 17: 0.19
                                Prediction 18: 0.25
                                Actual 18: 0.25
                                Prediction 19: 0.17
                                Actual 19: 0.15
                                Prediction 20: 0.23
                                Actual 20: 0.24
                                Prediction 21: 0.23
                                Actual 21: 0.23
                                Prediction 22: 0.28
                                Actual 22: 0.26
                                Prediction 23: 0.23
                                Actual 23: 0.24
                                Prediction 24: 0.19
                                Actual 24: 0.18
                                Prediction 25: 0.27
                                Actual 25: 0.29
                                Prediction 26: 0.20
                                Actual 26: 0.18
                                Prediction 27: 0.25
                                Actual 27: 0.27
                                Prediction 28: 0.23
                                Actual 28: 0.22
                                Prediction 29: 0.26
                                Actual 29: 0.25
                                Prediction 30: 0.24
                                Actual 30: 0.23
                                Prediction 31: 0.20
                                Actual 31: 0.20
                                Prediction 32: 0.20
                                Actual 32: 0.19
                                Prediction 33: 0.26
                                Actual 33: 0.25
                                Prediction 34: 0.25
                                Actual 34: 0.27
                                Prediction 35: 0.28
                                Actual 35: 0.28
                                Prediction 36: 0.27
                                Actual 36: 0.28
                                Prediction 37: 0.25
                                Actual 37: 0.24
                                Prediction 38: 0.26
                                Actual 38: 0.26
                                Prediction 39: 0.26
                                Actual 39: 0.25
                                Prediction 40: 0.24
                                Actual 40: 0.22
                                Prediction 41: 0.21
                                Actual 41: 0.20
                                Prediction 42: 0.27
                                Actual 42: 0.27
                                Prediction 43: 0.20
                                Actual 43: 0.17
                                Prediction 44: 0.24
                                Actual 44: 0.23
                                Prediction 45: 0.23
                                Actual 45: 0.23
                                Prediction 46: 0.23
                                Actual 46: 0.22
                                Prediction 47: 0.28
                                Actual 47: 0.28
                                Prediction 48: 0.26
                                Actual 48: 0.28
                                Prediction 49: 0.20
                                Actual 49: 0.21
                                Prediction 50: 0.29
                                Actual 50: 0.31
                                Prediction 51: 0.28
                                Actual 51: 0.27
                                Prediction 52: 0.25
                                Actual 52: 0.23
                                Prediction 53: 0.27
                                Actual 53: 0.28
                                Prediction 54: 0.23
                                Actual 54: 0.23
                                Prediction 55: 0.26
                                Actual 55: 0.27
                                Prediction 56: 0.26
                                Actual 56: 0.27
                                Prediction 57: 0.25
                                Actual 57: 0.24
                                Prediction 58: 0.24
                                Actual 58: 0.23
                                Prediction 59: 0.22
                                Actual 59: 0.21
                                Prediction 60: 0.27
                                Actual 60: 0.27
                                Prediction 61: 0.23
                                Actual 61: 0.23
                                Prediction 62: 0.27
                                Actual 62: 0.27
                                Prediction 63: 0.23
                                Actual 63: 0.24
                                Prediction 64: 0.17
                                Actual 64: 0.18
                                Prediction 65: 0.21
                                Actual 65: 0.20
                                Prediction 66: 0.24
                                Actual 66: 0.22
                                Prediction 67: 0.27
                                Actual 67: 0.23
                                Prediction 68: 0.28
                                Actual 68: 0.27
                                Prediction 69: 0.23
                                Actual 69: 0.23
                                Prediction 70: 0.26
                                Actual 70: 0.22
                                Prediction 71: 0.33
                                Actual 71: 0.33
                                Prediction 72: 0.21
                                Actual 72: 0.21
                                Prediction 73: 0.25
                                Actual 73: 0.19
                                Prediction 74: 0.23
                                Actual 74: 0.24
                                Prediction 75: 0.25
                                Actual 75: 0.25
                                Prediction 76: 0.22
                                Actual 76: 0.23
                                Prediction 77: 0.24
                                Actual 77: 0.24
                                Prediction 78: 0.22
                                Actual 78: 0.23
                                Prediction 79: 0.20
                                Actual 79: 0.19
                                Prediction 80: 0.22
                                Actual 80: 0.21
                                Prediction 81: 0.26
                                Actual 81: 0.26
                                Prediction 82: 0.25
                                Actual 82: 0.25
                                Prediction 83: 0.22
                                Actual 83: 0.22
                                Prediction 84: 0.25
                                Actual 84: 0.26
                                Prediction 85: 0.23
                                Actual 85: 0.23
                                Prediction 86: 0.26
                                Actual 86: 0.25
                                Prediction 87: 0.25
                                Actual 87: 0.24
                                Prediction 88: 0.18
                                Actual 88: 0.21
                                Prediction 89: 0.21
                                Actual 89: 0.19
                                Prediction 90: 0.25
                                Actual 90: 0.25
                                Prediction 91: 0.23
                                Actual 91: 0.22
                                Prediction 92: 0.21
                                Actual 92: 0.23
                                Prediction 93: 0.21
                                Actual 93: 0.20
                                Prediction 94: 0.23
                                Actual 94: 0.23
                                Prediction 95: 0.20
                                Actual 95: 0.20
                                Prediction 96: 0.23
                                Actual 96: 0.24
                                Prediction 97: 0.24
                                Actual 97: 0.24
                                Prediction 98: 0.24
                                Actual 98: 0.25
                                Prediction 99: 0.10
                                Actual 99: 0.20
                                Prediction 100: 0.26
                                Actual 100: 0.25

                                --- Model Evaluation ---
                                Mean Squared Error: 0.0003
                                R-squared: 0.7068

Cleaning the data

The remaining sections show what we did while researching how to clean the data.

import pandas as pd
import re

# Load the raw statistics table. encoding_errors="ignore" asks the decoder to
# skip bytes it cannot decode rather than raise, so undecodable characters are
# silently missing from the loaded text.
csv_path = 'baseball_stats.csv'
baseball_stats = pd.read_csv(csv_path, encoding_errors="ignore")

# Preview the first rows to check that the columns were parsed as expected.
baseball_stats.head()

	Rk	Player	Age	Team	Lg	WAR	G	PA	AB	R	...	rOBA	Rbat+	TB	GIDP	HBP	SH	SF	IBB	Pos	Awards
0	1	Jarren Duran	27	BOS	AL	8.7	160	735	671	111	...	0.373	134	330	6	6	1	3	1	87	AS,MVP-8
1	2	Shohei Ohtani	29	LAD	NL	9.2	159	731	636	134	...	0.449	190	411	7	6	0	5	10	D	AS,MVP-1,SS
2	3	Gunnar Henderson	23	BAL	AL	9.1	159	719	630	118	...	0.385	157	333	2	7	0	4	1	6/D	AS,MVP-4
3	4	Marcus Semien	33	TEX	AL	4.1	159	718	650	101	...	0.310	100	254	9	3	0	1	2	4	AS
4	5	Juan Soto	25	NYY	AL	7.9	157	713	576	128	...	0.424	179	328	10	4	0	4	2	9/7DH	AS,MVP-3,SS

5 rows × 34 columns

Removing the team and league columns.

# Work on a copy of the raw table without the team and league columns.
columns_to_remove = ['Team', 'Lg']
stats = baseball_stats.drop(columns_to_remove, axis=1)
stats
# Ad-hoc check kept for reference:
# baseball_stats.query('Player == "Jos Tena*"')

	Rk	Player	Age	WAR	G	PA	AB	R	H	2B	...	rOBA	Rbat+	TB	GIDP	HBP	SH	SF	IBB	Pos	Awards
0	1	Jarren Duran	27	8.7	160	735	671	111	191	48	...	0.373	134	330	6	6	1	3	1	87	AS,MVP-8
1	2	Shohei Ohtani	29	9.2	159	731	636	134	197	38	...	0.449	190	411	7	6	0	5	10	D	AS,MVP-1,SS
2	3	Gunnar Henderson	23	9.1	159	719	630	118	177	31	...	0.385	157	333	2	7	0	4	1	6/D	AS,MVP-4
3	4	Marcus Semien	33	4.1	159	718	650	101	154	27	...	0.310	100	254	9	3	0	1	2	4	AS
4	5	Juan Soto	25	7.9	157	713	576	128	166	31	...	0.424	179	328	10	4	0	4	2	9/7DH	AS,MVP-3,SS
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
495	388	Nick Loftin	25	0.2	56	171	148	15	28	4	...	0.258	56	35	4	1	1	2	0	45H/3D71	NaN
496	389	angel Martinez	22	0.1	43	169	151	16	35	7	...	0.286	80	51	2	0	1	2	0	87/9H5D4	NaN
497	390	Jose Tena	23	0.2	44	168	161	14	43	5	...	0.290	81	57	1	0	0	0	0	5/4DH	NaN
498	390	Jose Tena	23	-0.1	3	4	4	0	0	0	...	0.000	-124	0	0	0	0	0	0	/5DH	NaN
499	390	Jose Tena	23	0.3	41	164	157	14	43	5	...	0.297	86	57	1	0	0	0	0	4-May	NaN

500 rows × 32 columns

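Changing the position and awards columns into numbers, starting with the awards. The awards tracked are MVP, GG, SS, and AS. GG, SS, and AS are stored as boolean flags: 0 if the player didn't receive the award and 1 if they did. MVP is stored as the number that comes after it, e.g. MVP-9 becomes 9 and MVP-12 becomes 12; players without an MVP entry get a missing value (NaN).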

# One 0/1 indicator column per award code: 1 when the code appears in the
# player's 'Awards' string. na=False makes a missing Awards value count as 0.
for award_code in ('GG', 'AS', 'SS'):
    has_award = stats['Awards'].str.contains(award_code, na=False)
    stats[award_code] = has_award.astype(int)


def extract_mvp_rank(awards):
    """Return the number that follows ``MVP-`` in an awards string.

    Args:
        awards: A value from the 'Awards' column, e.g. ``"AS,MVP-8"``, or a
            missing value.

    Returns:
        The digits after ``MVP-`` as an ``int``, or ``None`` when ``awards``
        is missing or contains no ``MVP-<number>`` entry.
    """
    if pd.isna(awards):
        return None
    found = re.search(r'MVP-(\d+)', awards)
    if found is None:
        return None
    return int(found.group(1))


# Add the MVP ranking column.
stats['MVP'] = stats['Awards'].apply(extract_mvp_rank)

# The raw text column is no longer needed once the four award columns exist.
stats = stats.drop(columns='Awards')
stats

	Rk	Player	Age	WAR	G	PA	AB	R	H	2B	...	GIDP	HBP	SH	SF	IBB	Pos	GG	AS	SS	MVP
0	1	Jarren Duran	27	8.7	160	735	671	111	191	48	...	6	6	1	3	1	87	0	1	0	8.0
1	2	Shohei Ohtani	29	9.2	159	731	636	134	197	38	...	7	6	0	5	10	D	0	1	1	1.0
2	3	Gunnar Henderson	23	9.1	159	719	630	118	177	31	...	2	7	0	4	1	6/D	0	1	0	4.0
3	4	Marcus Semien	33	4.1	159	718	650	101	154	27	...	9	3	0	1	2	4	0	1	0	NaN
4	5	Juan Soto	25	7.9	157	713	576	128	166	31	...	10	4	0	4	2	9/7DH	0	1	1	3.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
495	388	Nick Loftin	25	0.2	56	171	148	15	28	4	...	4	1	1	2	0	45H/3D71	0	0	0	NaN
496	389	angel Martinez	22	0.1	43	169	151	16	35	7	...	2	0	1	2	0	87/9H5D4	0	0	0	NaN
497	390	Jose Tena	23	0.2	44	168	161	14	43	5	...	1	0	0	0	0	5/4DH	0	0	0	NaN
498	390	Jose Tena	23	-0.1	3	4	4	0	0	0	...	0	0	0	0	0	/5DH	0	0	0	NaN
499	390	Jose Tena	23	0.3	41	164	157	14	43	5	...	1	0	0	0	0	4-May	0	0	0	NaN

500 rows × 35 columns

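Changing the position column into numbers: each player's position is taken as the first digit (1–9) that appears in their Pos string, and players whose Pos string has no digit (e.g. "D") get a missing value (NaN).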

# Lookup from a position digit as text ('1'..'9') to the same number as an int.
position_map = {str(code): code for code in range(1, 10)}


def extract_primary_position(pos_string):
    """Return the numeric code of the first position digit in ``pos_string``.

    Args:
        pos_string: A value from the 'Pos' column, e.g. ``"87/9H5D4"``, or a
            missing value.

    Returns:
        The ``position_map`` entry for the first character in ``1``-``9``
        found in the string, or ``None`` when the value is missing or holds no
        such digit (e.g. ``"D"``).
    """
    if pd.isna(pos_string):
        return None
    # Only the first digit matters; everything after it is ignored.
    # NOTE(review): the data contains a Pos value "4-May", which looks like a
    # spreadsheet date conversion of "5/4"; it is read here as position 4.
    # Confirm against the source CSV.
    first_digit = re.search(r'[1-9]', pos_string)
    if first_digit is None:
        return None
    return position_map[first_digit.group()]


# Replace the free-form 'Pos' text with the numeric 'Position' column.
stats['Position'] = stats['Pos'].apply(extract_primary_position)
stats = stats.drop(columns='Pos')
stats

	Rk	Player	Age	WAR	G	PA	AB	R	H	2B	...	GIDP	HBP	SH	SF	IBB	GG	AS	SS	MVP	Position
0	1	Jarren Duran	27	8.7	160	735	671	111	191	48	...	6	6	1	3	1	0	1	0	8.0	8.0
1	2	Shohei Ohtani	29	9.2	159	731	636	134	197	38	...	7	6	0	5	10	0	1	1	1.0	NaN
2	3	Gunnar Henderson	23	9.1	159	719	630	118	177	31	...	2	7	0	4	1	0	1	0	4.0	6.0
3	4	Marcus Semien	33	4.1	159	718	650	101	154	27	...	9	3	0	1	2	0	1	0	NaN	4.0
4	5	Juan Soto	25	7.9	157	713	576	128	166	31	...	10	4	0	4	2	0	1	1	3.0	9.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
495	388	Nick Loftin	25	0.2	56	171	148	15	28	4	...	4	1	1	2	0	0	0	0	NaN	4.0
496	389	angel Martinez	22	0.1	43	169	151	16	35	7	...	2	0	1	2	0	0	0	0	NaN	8.0
497	390	Jose Tena	23	0.2	44	168	161	14	43	5	...	1	0	0	0	0	0	0	0	NaN	5.0
498	390	Jose Tena	23	-0.1	3	4	4	0	0	0	...	0	0	0	0	0	0	0	0	NaN	5.0
499	390	Jose Tena	23	0.3	41	164	157	14	43	5	...	1	0	0	0	0	0	0	0	NaN	4.0

500 rows × 35 columns

Eliminating duplicate player rows by keeping, for each player, the row with the most games played (G).

# Some players appear on several rows (e.g. "Jose Tena" has three). Keep one
# row per player: the row with the largest games-played value ('G').

# Highest 'G' value recorded for each player.
max_g = stats.groupby('Player', as_index=False)['G'].max()

# Merge the per-player maximum 'G' back onto the full table so that only rows
# whose 'G' equals that maximum survive.
merged_data = stats.merge(max_g, on=['Player', 'G'])

# Two rows of the same player can tie on 'G' without being identical in every
# column, so de-duplicate on 'Player' itself (first occurrence wins) to
# guarantee a single row per player.
merged_data = merged_data.drop_duplicates(subset='Player', keep='first')

# Spot-check a player that had several rows in the input.
merged_data.query('Player == "Jose Tena"')

	Rk	Player	Age	WAR	G	PA	AB	R	H	2B	...	GIDP	HBP	SH	SF	IBB	GG	AS	SS	MVP	Position
389	390	Jose Tena	23	0.2	44	168	161	14	43	5	...	1	0	0	0	0	0	0	0	NaN	5.0

1 rows × 35 columns

Writing the cleaned data to a new CSV file.

# Write the de-duplicated table to disk; index=False keeps the DataFrame's row
# index out of the output file.
merged_data.to_csv(path_or_buf='merged_baseball_stats.csv', index=False)