# Installing the necessary libraries
!pip install git+https://github.com/huggingface/transformers
!pip install jupyterlab ipywidgets bertviz xformers evaluate matplotlib scikit-learn pandas torch

# Importing libraries
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from transformers import BertModel, BertTokenizer, pipeline
from bertviz import model_view

# Tokenization and Model Setup for BERT
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Tokenizing a sample text
sample_text = "I live in Montreal and I like this city."
tokenized_output = tokenizer(sample_text)
print("Tokenized IDs:", tokenized_output["input_ids"])
print("Tokens:", tokenizer.convert_ids_to_tokens(tokenized_output["input_ids"]))

# Positional Encoding Function
def positional_encoding(num_tokens, dimensions, scaling_factor=10000):
    """
    Calculates the sinusoidal positional encoding for a given number of tokens and dimensions.

    Args:
    - num_tokens: Number of tokens in the sequence.
    - dimensions: Dimensions of the model (d_model).
    - scaling_factor: Scaling factor used in the encoding calculations (10000 in the original Transformer paper).

    Returns:
    - A numpy array of shape (num_tokens, dimensions) containing the positional encodings.
    """
    position = np.zeros((num_tokens, dimensions))
    for i in range(num_tokens):
        for j in range(0, dimensions, 2):
            # Even indices use sine, odd indices use cosine; both share the same frequency term.
            position[i, j] = np.sin(i / (scaling_factor ** (j / dimensions)))
            if j + 1 < dimensions:
                position[i, j + 1] = np.cos(i / (scaling_factor ** (j / dimensions)))
    return position
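
# Quick check (added sketch): at position 0 every sine term is sin(0)=0 and every cosine
# term is cos(0)=1, so the first row should read [0, 1, 0, 1, ...].
print(positional_encoding(4, 8)[0])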

# Visualizing Positional Encoding
pos_encoding = positional_encoding(100, 256)
plt.figure(figsize=(10, 8))
plt.matshow(pos_encoding, cmap='viridis', fignum=0)  # fignum=0 draws into the existing figure
plt.colorbar()
plt.title("Positional Encoding")
plt.xlabel("Dimensions")
plt.ylabel("Token Position")
plt.show()
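
# Optional extra plot (added sketch): each encoding dimension is a sinusoid with its own
# wavelength; low dimensions oscillate quickly over positions, high dimensions slowly.
plt.figure(figsize=(10, 4))
for dim in (4, 20, 100):
    plt.plot(pos_encoding[:, dim], label=f"dim {dim}")
plt.xlabel("Token Position")
plt.legend()
plt.show()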

# Self-Attention Visualization with BertViz
sample_text = "I live in Montreal and I like this city."
inputs = tokenizer(sample_text, return_tensors="pt")
outputs = model(**inputs, output_attentions=True)  # return per-layer attention weights
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
model_view(outputs.attentions, tokens, display_mode="dark")
model_view(outputs.attentions, tokens, display_mode="light")
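
# Alternative view (added sketch): bertviz's head_view shows one layer at a time with
# per-head coloring, using the same attention tensors and tokens.
from bertviz import head_view
head_view(outputs.attentions, tokens)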

# Text Generation with GPT-2
gpt2_generator = pipeline('text-generation', model='gpt2')
print(gpt2_generator("I enjoy hiking and running.", max_length=20, num_return_sequences=3))
print(gpt2_generator("Montreal is a good place for sports.", max_length=50, num_return_sequences=5))


# Sentiment Analysis
sentiment_pipeline = pipeline("sentiment-analysis")

## Positive result
print(sentiment_pipeline("I love Montreal, it's a fantastic city!"))

## Negative result
print(sentiment_pipeline("I hate fighting."))


# Word Embedding Visualization
## Extract contextual embeddings from BERT's last hidden layer
tokenized_text = tokenizer("I live in Montreal and I like this city.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**tokenized_text)
embeddings = outputs.last_hidden_state.squeeze().numpy()

## Use t-SNE for dimensionality reduction
tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=2500, random_state=23)
new_values = tsne_model.fit_transform(embeddings)

## Plotting
df = pd.DataFrame(new_values, columns=['x', 'y'])
df['token'] = tokenizer.convert_ids_to_tokens(tokenized_text['input_ids'].squeeze().tolist())

plt.figure(figsize=(8, 6))
for i, token in enumerate(df['token']):
    plt.scatter(df.iloc[i]['x'], df.iloc[i]['y'])
    plt.annotate(token, xy=(df.iloc[i]['x'], df.iloc[i]['y']), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
plt.show()
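
# Optional check (added sketch): the token "i" appears twice in the sentence; because BERT
# embeddings are contextual, the two occurrences land at different points in the plot.
i_positions = [idx for idx, tok in enumerate(df['token']) if tok == 'i']
print("Positions of 'i':", i_positions)
print(df.iloc[i_positions][['x', 'y']])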