In this analysis, we explore retail transaction data to surface insights businesses can act on. Market Basket Analysis is a powerful technique that helps retailers understand customer behavior, identify associations between items, and offer personalized recommendations.
The primary goal of this notebook is to help retailers optimize their product catalog, sharpen marketing strategies, and ultimately improve customer engagement and satisfaction. By mining transaction data with the Apriori algorithm, we aim to uncover hidden patterns in customer purchasing habits.
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csc_matrix
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# Import the dataset
ds = pd.read_excel("assets/data/Analyzing_Consumer_Behaviour_Data.xlsx")
# Explore the dataset
ds.head(200)
|   | BillNo | Itemname | Quantity | Date | Price | CustomerID | Country |
|---|---|---|---|---|---|---|---|
| 0 | 536365 | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 195 | 536389 | CHRISTMAS LIGHTS 10 REINDEER | 6 | 2010-12-01 10:03:00 | 8.50 | 12431.0 | Australia |
| 196 | 536389 | VINTAGE UNION JACK CUSHION COVER | 8 | 2010-12-01 10:03:00 | 4.95 | 12431.0 | Australia |
| 197 | 536389 | VINTAGE HEADS AND TAILS CARD GAME | 12 | 2010-12-01 10:03:00 | 1.25 | 12431.0 | Australia |
| 198 | 536389 | SET OF 3 COLOURED FLYING DUCKS | 6 | 2010-12-01 10:03:00 | 5.45 | 12431.0 | Australia |
| 199 | 536389 | SET OF 3 GOLD FLYING DUCKS | 4 | 2010-12-01 10:03:00 | 6.35 | 12431.0 | Australia |

200 rows × 7 columns
# Check for missing values
ds.isnull().sum()
BillNo             0
Itemname        1455
Quantity           0
Date               0
Price              0
CustomerID    134041
Country            0
dtype: int64
# Drop the rows with missing values
ds.dropna(axis=0, inplace=True)
# Verify if the rows with missing values were dropped
ds.isnull().sum()
BillNo        0
Itemname      0
Quantity      0
Date          0
Price         0
CustomerID    0
Country       0
dtype: int64
Next, we convert the "Date" column to a datetime format using the pd.to_datetime() function.
# Convert the "Date" column to a datetime format
ds["Date"] = pd.to_datetime(ds["Date"])
# Verify the data type of columns
ds.dtypes
BillNo                object
Itemname              object
Quantity               int64
Date          datetime64[ns]
Price                float64
CustomerID           float64
Country               object
dtype: object
We use matplotlib and seaborn to create visualizations.
# Visualize data distribution (Quantity and Price)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(ds['Quantity'], bins=20, kde=True)
plt.title('Distribution of Quantity')
plt.xlabel('Quantity')
plt.ylabel('Frequency')
plt.subplot(1, 2, 2)
sns.histplot(ds['Price'], bins=20, kde=True)
plt.title('Distribution of Price')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
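Retail quantity and price columns are typically heavily right-skewed, so the linear-scale histograms above tend to pile most of the data into the first bin. As a minimal sketch, assuming strictly positive values (a log scale cannot display zeros or negatives), seaborn's `log_scale` option often gives a more readable picture:

```python
# Re-plot the same distributions on a logarithmic axis for readability.
# Filter to positive values first, since a log scale is undefined at <= 0.
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.histplot(ds.loc[ds['Quantity'] > 0, 'Quantity'], bins=20, log_scale=True)
plt.title('Distribution of Quantity (log scale)')

plt.subplot(1, 2, 2)
sns.histplot(ds.loc[ds['Price'] > 0, 'Price'], bins=20, log_scale=True)
plt.title('Distribution of Price (log scale)')

plt.tight_layout()
plt.show()
```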
# Visualize sales trends over time
# Extract month and year from the 'Date' column
ds['Year'] = ds['Date'].dt.year
ds['Month'] = ds['Date'].dt.month
# Compute line-item revenue (Quantity x Price); summing 'Price' alone would
# total unit prices rather than sales value
ds['Revenue'] = ds['Quantity'] * ds['Price']
# Group data by month and calculate total sales
monthly_sales = ds.groupby(['Year', 'Month'])['Revenue'].sum().reset_index()
# Plot sales trends over time
plt.figure(figsize=(12, 6))
sns.lineplot(data=monthly_sales, x='Month', y='Revenue', hue='Year')
plt.title('Monthly Sales Trends')
plt.xlabel('Month')
plt.ylabel('Total Sales')
plt.xticks(range(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.legend(title='Year')
plt.show()
# Visualize the number of transactions by country
# (count unique invoices per country; raw row counts would count line items)
transactions_by_country = ds.groupby('Country')['BillNo'].nunique().sort_values(ascending=False)
plt.figure(figsize=(12, 6))
sns.barplot(x=transactions_by_country.index, y=transactions_by_country.values)
plt.title('Number of Transactions by Country')
plt.xlabel('Country')
plt.xticks(rotation=90)
plt.ylabel('Number of Transactions')
plt.tight_layout()
plt.show()
We use the pd.crosstab() function to create a transaction dataset. Alternatively, the mlxtend.preprocessing.TransactionEncoder() utility can encode the transaction dataset.
# Group data by 'BillNo' and 'Itemname'
transaction_data = pd.crosstab(index=ds['BillNo'], columns=ds['Itemname'])
# Convert the pivot table to a binary transaction dataset
def encode_units(x):
    # Map any positive count to 1 and everything else to 0
    if x <= 0:
        return 0
    return 1

transaction_data = transaction_data.applymap(encode_units)
# Verify the transaction dataset
transaction_data.head(10)
| BillNo | 10 COLOUR SPACEBOY PEN | 12 COLOURED PARTY BALLOONS | 12 DAISY PEGS IN WOOD BOX | 12 EGG HOUSE PAINTED WOOD | 12 HANGING EGGS HAND PAINTED | 12 IVORY ROSE PEG PLACE SETTINGS | 12 MESSAGE CARDS WITH ENVELOPES | 12 PENCIL SMALL TUBE WOODLAND | 12 PENCILS SMALL TUBE RED RETROSPOT | 12 PENCILS SMALL TUBE SKULL | ... | ZINC STAR T-LIGHT HOLDER | ZINC SWEETHEART SOAP DISH | ZINC SWEETHEART WIRE LETTER RACK | ZINC T-LIGHT HOLDER STAR LARGE | ZINC T-LIGHT HOLDER STARS LARGE | ZINC T-LIGHT HOLDER STARS SMALL | ZINC TOP 2 DOOR WOODEN SHELF | ZINC WILLIE WINKIE CANDLE STICK | ZINC WIRE KITCHEN ORGANISER | ZINC WIRE SWEETHEART LETTER TRAY |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 536365 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 536366 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 536367 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 536368 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 536369 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 536370 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 536371 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 536372 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 536373 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 536374 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

10 rows × 3846 columns
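As a side note, `applymap` calls `encode_units` on every one of the table's millions of cells, which can be slow at this width. A vectorized one-liner gives the same binary result, assuming the crosstab holds non-negative counts:

```python
# Vectorized equivalent of encode_units: flag any positive count as 1.
transaction_data = (transaction_data > 0).astype(int)
```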
# Convert the dense table to a SciPy sparse matrix, then rebuild it as a pandas sparse DataFrame
sparse_transaction_data = csc_matrix(transaction_data.values)
sparse_transaction_df = pd.DataFrame.sparse.from_spmatrix(sparse_transaction_data, columns=transaction_data.columns)
# Convert the dataframe to boolean values with SparseDtype
sparse_transaction_df = sparse_transaction_df.astype(pd.SparseDtype(bool))
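For reference, here is a minimal sketch of the `mlxtend.preprocessing.TransactionEncoder()` route mentioned above; it builds the same kind of one-hot table directly from per-invoice item lists:

```python
from mlxtend.preprocessing import TransactionEncoder

# Collect the items on each invoice into a list of transactions
transactions = ds.groupby('BillNo')['Itemname'].apply(list).tolist()

# One-hot encode the transactions into a boolean DataFrame
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
encoded_df = pd.DataFrame(te_array, columns=te.columns_)
```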
# Apply the Apriori algorithm
# Set the minimum support and confidence threshold
min_support = 0.01 # Set your desired support threshold
min_confidence = 0.5 # Set your desired confidence threshold
# Generate frequent item sets
frequent_itemsets = apriori(sparse_transaction_df, min_support=min_support, use_colnames=True)
# Generate association rules; note that min_threshold applies to the chosen
# metric ('lift' here), so this keeps rules with lift >= 0.5 -- confidence
# is filtered separately below
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=min_confidence)
# Display association rules with support, confidence, and lift
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
|   | antecedents | consequents | support | confidence | lift |
|---|---|---|---|---|---|
| 0 | (60 CAKE CASES DOLLY GIRL DESIGN) | (PACK OF 72 RETROSPOT CAKE CASES) | 0.010075 | 0.543027 | 9.942534 |
| 1 | (PACK OF 72 RETROSPOT CAKE CASES) | (60 CAKE CASES DOLLY GIRL DESIGN) | 0.010075 | 0.184476 | 9.942534 |
| 2 | (72 SWEETHEART FAIRY CAKE CASES) | (60 TEATIME FAIRY CAKE CASES) | 0.011617 | 0.439583 | 12.673257 |
| 3 | (60 TEATIME FAIRY CAKE CASES) | (72 SWEETHEART FAIRY CAKE CASES) | 0.011617 | 0.334921 | 12.673257 |
| 4 | (PACK OF 60 DINOSAUR CAKE CASES) | (60 TEATIME FAIRY CAKE CASES) | 0.011782 | 0.419608 | 12.097361 |
| ... | ... | ... | ... | ... | ... |
| 933 | (LUNCH BAG PINK POLKADOT, LUNCH BAG CARS BLUE) | (LUNCH BAG BLACK SKULL., LUNCH BAG RED RETROS... | 0.010461 | 0.451306 | 16.135981 |
| 934 | (LUNCH BAG BLACK SKULL.) | (LUNCH BAG RED RETROSPOT, LUNCH BAG PINK POLKA... | 0.010461 | 0.182868 | 12.166425 |
| 935 | (LUNCH BAG RED RETROSPOT) | (LUNCH BAG BLACK SKULL., LUNCH BAG PINK POLKA... | 0.010461 | 0.150079 | 10.606555 |
| 936 | (LUNCH BAG PINK POLKADOT) | (LUNCH BAG BLACK SKULL., LUNCH BAG RED RETROS... | 0.010461 | 0.207877 | 14.634412 |
| 937 | (LUNCH BAG CARS BLUE) | (LUNCH BAG BLACK SKULL., LUNCH BAG RED RETROS... | 0.010461 | 0.200422 | 12.093899 |

938 rows × 5 columns
We use pandas to filter rules based on threshold values.
# Assess the rules with minimum support, confidence, and lift to identify meaningful associations
min_lift = 1.0 # Set your desired lift threshold
# Filter rules based on lift threshold
filtered_rules = rules[
(rules['support'] >= min_support) &
(rules['confidence'] >= min_confidence) &
(rules['lift'] >= min_lift)
]
# Sort and select relevant rules
# Depending on your business objective, you may sort the rules by different metrics
sorted_rules = filtered_rules.sort_values(by='lift', ascending=False)
# Display the top 10 rules
sorted_rules.head(10)
|   | antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | zhangs_metric |
|---|---|---|---|---|---|---|---|---|---|---|
| 529 | (REGENCY SUGAR BOWL GREEN) | (REGENCY MILK JUG PINK) | 0.013269 | 0.013489 | 0.010130 | 0.763485 | 56.600762 | 0.009952 | 4.171038 | 0.995542 |
| 528 | (REGENCY MILK JUG PINK) | (REGENCY SUGAR BOWL GREEN) | 0.013489 | 0.013269 | 0.010130 | 0.751020 | 56.600762 | 0.009952 | 3.963101 | 0.995764 |
| 531 | (REGENCY TEA PLATE GREEN) | (REGENCY TEA PLATE ROSES) | 0.013544 | 0.016572 | 0.011397 | 0.841463 | 50.775748 | 0.011172 | 6.203160 | 0.993765 |
| 530 | (REGENCY TEA PLATE ROSES) | (REGENCY TEA PLATE GREEN) | 0.016572 | 0.013544 | 0.011397 | 0.687708 | 50.775748 | 0.011172 | 3.158758 | 0.996825 |
| 504 | (POPPY'S PLAYHOUSE BEDROOM) | (POPPY'S PLAYHOUSE LIVINGROOM) | 0.016902 | 0.013434 | 0.010901 | 0.644951 | 48.009211 | 0.010674 | 2.778677 | 0.996006 |
| 505 | (POPPY'S PLAYHOUSE LIVINGROOM) | (POPPY'S PLAYHOUSE BEDROOM) | 0.013434 | 0.016902 | 0.010901 | 0.811475 | 48.009211 | 0.010674 | 5.214691 | 0.992504 |
| 560 | (SET/6 RED SPOTTY PAPER PLATES) | (SET/6 RED SPOTTY PAPER CUPS) | 0.017398 | 0.015306 | 0.012608 | 0.724684 | 47.346860 | 0.012342 | 3.576590 | 0.996211 |
| 561 | (SET/6 RED SPOTTY PAPER CUPS) | (SET/6 RED SPOTTY PAPER PLATES) | 0.015306 | 0.017398 | 0.012608 | 0.823741 | 47.346860 | 0.012342 | 5.574762 | 0.994095 |
| 507 | (POPPY'S PLAYHOUSE LIVINGROOM) | (POPPY'S PLAYHOUSE KITCHEN) | 0.013434 | 0.018499 | 0.011452 | 0.852459 | 46.080991 | 0.011203 | 6.652395 | 0.991620 |
| 506 | (POPPY'S PLAYHOUSE KITCHEN) | (POPPY'S PLAYHOUSE LIVINGROOM) | 0.018499 | 0.013434 | 0.011452 | 0.619048 | 46.080991 | 0.011203 | 2.589736 | 0.996738 |
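To make the rules above actionable, a small helper can turn them into "customers also bought" suggestions. This is a hypothetical sketch (`recommend_items` is not part of mlxtend): it matches rules whose antecedents contain a given item and returns the strongest consequents by lift.

```python
def recommend_items(item, rules_df, top_n=5):
    # Hypothetical helper: keep rules whose antecedent itemset contains the query item
    matches = rules_df[rules_df['antecedents'].apply(lambda a: item in a)]
    # Strongest associations first
    matches = matches.sort_values(by='lift', ascending=False).head(top_n)
    # Flatten the consequent frozensets into a de-duplicated list
    suggestions = []
    for consequents in matches['consequents']:
        for product in consequents:
            if product not in suggestions:
                suggestions.append(product)
    return suggestions

# Example: cross-sell candidates for the green sugar bowl
print(recommend_items('REGENCY SUGAR BOWL GREEN', sorted_rules))
```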
Libraries like matplotlib and networkx can be helpful for visualization.
# Scatter plot of association rules
plt.figure(figsize=(10, 6))
plt.scatter(rules['support'], rules['confidence'], alpha=0.5)
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.title('Association Rules - Support vs. Confidence')
plt.show()
# Create a directed graph
G = nx.DiGraph()
# Add nodes for antecedents and consequents
G.add_nodes_from(rules['antecedents'], color='blue', size=200)
G.add_nodes_from(rules['consequents'], color='red', size=200)
# Add edges between antecedents and consequents
for index, row in rules.iterrows():
G.add_edge(row['antecedents'], row['consequents'], weight=row['lift'])
# Draw the network graph
plt.figure(figsize=(25, 25))
pos = nx.spring_layout(G, k=0.5)
colors = [G[u][v]['weight'] for u, v in G.edges()]
# Nodes tagged with a 'color' attribute get a smaller size; unpack (node, data) pairs
sizes = [100 if 'color' in data else 300 for _, data in G.nodes(data=True)]
nx.draw(G, pos, edge_color=colors, node_size=sizes, with_labels=True, font_size=10)
plt.title('Association Rules - Network Graph')
plt.show()
Product Relationships
Analyze the antecedents and consequents of the association rules to understand how products are related. For example, if customers who buy Product A also tend to buy Product B, there may be a complementary relationship between the two products.
Customer Behavior
Consider what customer behaviors are revealed by the rules. Are there specific patterns of purchasing behavior? Are certain items frequently bought together by a particular group of customers?
Actionable Insights
Frequency and Confidence
Pay attention to the support, confidence, and lift values of the association rules. High-confidence rules are more reliable, and high-lift rules indicate a strong association.
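For reference, these are the standard definitions, where $P(\cdot)$ denotes the fraction of transactions containing an itemset:

$$
\mathrm{support}(A \Rightarrow B) = P(A \cap B), \qquad
\mathrm{confidence}(A \Rightarrow B) = \frac{P(A \cap B)}{P(A)}, \qquad
\mathrm{lift}(A \Rightarrow B) = \frac{P(A \cap B)}{P(A)\,P(B)}
$$

A lift above 1 means the two itemsets co-occur more often than they would if purchases were independent.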
Thresholds
Adjust support and confidence thresholds to filter rules based on their significance. Experiment with different threshold values to refine the rules according to your objectives.
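A quick way to experiment is to sweep candidate thresholds and see how many itemsets and rules survive each one. The sketch below reuses the `sparse_transaction_df` and `min_confidence` defined earlier; the specific support values are arbitrary starting points, not recommendations.

```python
# Sweep a few candidate support thresholds and report how many frequent
# itemsets and rules each yields, to guide threshold selection.
for support in [0.005, 0.01, 0.02, 0.05]:
    itemsets = apriori(sparse_transaction_df, min_support=support, use_colnames=True)
    if itemsets.empty:
        print(f"min_support={support}: no frequent itemsets")
        continue
    swept = association_rules(itemsets, metric='confidence', min_threshold=min_confidence)
    print(f"min_support={support}: {len(itemsets)} itemsets, {len(swept)} rules")
```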
Feedback Loop
Implement and test the insights gained from the association rules in your retail strategy. Monitor the impact on sales and customer behavior. Use this feedback loop to continuously improve your strategies.
Segmentation
Consider segmenting your customers based on their purchasing behavior. Are there different groups with distinct preferences? Tailor your strategies to each segment.
Visualization
Visualize the insights and relationships discovered through association rules using charts, graphs, and reports. Visual aids can make complex patterns easier to understand.
Documentation
Document your findings and insights. Keep a record of which strategies you've implemented and their outcomes. This documentation can serve as a valuable resource for future decision-making.
NOTE: By following these guidelines, you can transform the association rules into actionable insights that drive your retail business forward, whether it's through improved product bundling, cross-selling, or enhanced customer experiences.
In the realm of retail, data-driven insights hold the key to unlocking opportunities for growth and customer satisfaction. Our journey through the Market Basket Analysis project has been a compelling exploration of the power of association rules and their potential to drive strategic decision-making in the retail industry.
Through meticulous data preprocessing, we transformed raw transaction data into a format suitable for uncovering hidden patterns and relationships among products and customer behavior. Leveraging the Apriori algorithm and the mlxtend library, we ventured into the world of frequent itemsets and association rules, unearthing valuable insights that can shape the future of our retail endeavors.
Our odyssey led us through critical stages:
Exploratory Data Analysis
We delved into the dataset, visualizing trends and patterns with libraries like Matplotlib and Seaborn. This initial step unveiled the potential hidden within our data, sparking our curiosity and setting the stage for deeper analysis.
Data Transformation
To perform Market Basket Analysis, we structured our data into a transaction-level format, a crucial step that allowed us to identify which products were frequently purchased together. Thanks to the Pandas and mlxtend libraries, this transformation was achieved seamlessly.
Apriori Algorithm
The Apriori algorithm emerged as our trusted companion in the quest for association rules. We experimented with support and confidence thresholds to generate meaningful rules that reflect customer preferences and product relationships.
Visualization
Visualization brought our findings to life. With NetworkX and Matplotlib, we crafted engaging network graphs that vividly illustrated product associations, making complex patterns more comprehensible.
Business Insights
Ultimately, the project's true essence lies in its potential to drive real-world decisions. We distilled actionable insights from the association rules, paving the way for strategic product bundling, cross-selling initiatives, and personalized customer recommendations.
As we conclude this chapter of our data exploration journey, we are armed with newfound knowledge and a deeper understanding of our retail landscape. The association rules we've uncovered are more than just patterns; they are a gateway to improved customer experiences, increased sales, and optimized retail strategies.
This project has illuminated the path forward—a path that involves data-driven decision-making, continuous experimentation, and a commitment to harnessing the insights concealed within our data. As we move forward, we do so with the conviction that the fusion of data science and retail holds boundless opportunities for innovation and growth.