| import pandas as pd |
| import numpy as np |
| from sklearn.cluster import KMeans |
| from pandas.tseries.holiday import USFederalHolidayCalendar as calendar |
|
|
def define_target(df):
    """
    Creates the binary target variable 'IsViolent' from the crime category.

    Args:
        df: DataFrame with a 'Category' column. It is modified in place.

    Returns:
        The same DataFrame with an integer 'IsViolent' column: 1 when
        'Category' is one of the violent categories listed below, 0 otherwise
        (missing or unlisted categories map to 0).
    """
    violent_categories = [
        'ASSAULT', 'ROBBERY', 'SEX OFFENSES FORCIBLE', 'KIDNAPPING', 'HOMICIDE', 'ARSON'
    ]

    # Vectorized membership test instead of a per-row Python lambda; this also
    # guarantees an integer column even when the frame has no rows.
    df['IsViolent'] = df['Category'].isin(violent_categories).astype(int)
    return df
|
|
def extract_temporal_features(df):
    """
    Extracts temporal features from the 'Dates' column.

    Adds the columns 'Hour', 'Day', 'Month', 'Year', 'DayOfWeek'
    (Monday=0 ... Sunday=6), 'IsWeekend' (1 for Saturday/Sunday) and
    'IsHoliday' (1 when the calendar day is a US federal holiday).

    Args:
        df: DataFrame with a 'Dates' column holding datetimes (values that
            pandas.to_datetime can parse are accepted too). It is modified
            in place; the 'Dates' column itself is left untouched.

    Returns:
        The same DataFrame with the feature columns added.
    """
    # No-op for a datetime column; lets string timestamps work as well.
    dates = pd.to_datetime(df['Dates'])

    df['Hour'] = dates.dt.hour
    df['Day'] = dates.dt.day
    df['Month'] = dates.dt.month
    df['Year'] = dates.dt.year
    df['DayOfWeek'] = dates.dt.dayofweek

    df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

    # Holidays are compared on the local calendar day, so drop any timezone
    # (keeping wall-clock time) and truncate to midnight.
    if dates.dt.tz is not None:
        dates = dates.dt.tz_localize(None)
    days = dates.dt.normalize()

    # The range bounds must be midnight-aligned: the calendar returns holidays
    # stamped at 00:00, so a start such as "2015-01-01 12:30" would exclude
    # the holiday falling on that very first day.
    first_day, last_day = days.min(), days.max()
    if pd.isna(first_day):
        # Empty frame or only missing dates: nothing can be a holiday.
        holidays = pd.DatetimeIndex([])
    else:
        holidays = calendar().holidays(start=first_day, end=last_day)

    df['IsHoliday'] = days.isin(holidays).astype(int)

    return df
|
|
def get_season(month):
    """
    Maps a month number to its season name.

    12, 1, 2 -> 'Winter'; 3, 4, 5 -> 'Spring'; 6, 7, 8 -> 'Summer';
    any other value -> 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    # Everything not matched above (including out-of-range values) is 'Fall'.
    return 'Fall'
|
|
def extract_contextual_features(df):
    """
    Adds the contextual 'Season' column.

    Each value of the existing 'Month' column is passed through get_season
    and the result is stored in 'Season'. The DataFrame is modified in place
    and returned.
    """
    season_labels = df['Month'].apply(get_season)
    df['Season'] = season_labels
    return df
|
|
def extract_location_features(df, n_clusters=10, kmeans_model=None):
    """
    Adds a 'LocationCluster' column computed from the 'X' and 'Y' columns.

    If kmeans_model is given, it is only used to predict cluster labels and
    n_clusters is not used. Otherwise a new KMeans model
    (n_clusters clusters, random_state=42, n_init=10) is fitted on the
    coordinates and used to label them.

    Returns:
        A (df, model) tuple: the DataFrame (modified in place) and the model
        that produced the labels (the supplied one or the newly fitted one).
    """
    coordinates = df[['X', 'Y']]

    # Reuse path: a model was supplied, so predict only.
    if kmeans_model is not None:
        df['LocationCluster'] = kmeans_model.predict(coordinates)
        return df, kmeans_model

    # Fit path: no model supplied, so learn the clusters from this data.
    fitted_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df['LocationCluster'] = fitted_model.fit_predict(coordinates)
    return df, fitted_model
|
|
def preprocess_pipeline(df, is_train=True, kmeans_model=None):
    """
    Runs the full preprocessing pipeline.

    Steps, in order: temporal features, contextual features, location
    clusters, and (training data only) the 'IsViolent' target.

    Args:
        df: Input DataFrame; the individual steps add columns to it in place.
        is_train: When True, the target column is created from 'Category'.
        kmeans_model: Previously fitted clustering model to reuse. When None,
            a new model is fitted on df.

    Returns:
        A (df, kmeans_model) tuple with the processed DataFrame and the
        clustering model that was used (reused or newly fitted).

    Warns:
        UserWarning: if is_train is False and no kmeans_model is supplied,
            because the clusters are then fitted on non-training data and
            their labels will not correspond to those seen in training.
    """
    import warnings

    if not is_train and kmeans_model is None:
        # Still proceed (previous behavior), but make the refit visible.
        warnings.warn(
            "preprocess_pipeline called with is_train=False and no "
            "kmeans_model; a new KMeans model will be fitted on this data, "
            "so cluster labels will not match those learned during training.",
            UserWarning,
            stacklevel=2,
        )

    df = extract_temporal_features(df)
    df = extract_contextual_features(df)

    df, kmeans_model = extract_location_features(df, kmeans_model=kmeans_model)

    if is_train:
        df = define_target(df)

    return df, kmeans_model
|
|
if __name__ == "__main__":
    # Nothing to run as a script; this module is meant to be imported.
    pass
|
|