{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Análisis de sentimiento" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "El análisis de sentimiento es una técnica de procesamiento de lenguaje natural (NLP) que permite determinar la actitud de un autor respecto a un tema o la polaridad de una opinión." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import re\n", "\n", "from sentiment_analysis_spanish import sentiment_analysis" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweethashtagsmentionslinks
13099El LG_ES G3 S ya a la venta por 299 euros -012
32607No habrá ningún desahucio más en Madrid sin al...002
69261El partido socialista derogará la reforma labo...110
59548.mdcospedal inicia la campaña electoral del PP...011
21659BuenosDíasUAM!!! \"La clave de la educación no ...300
...............
3344Nada más lindo q estar en casa :)000
64788|| Yo no me entero de que ha pasado. :D000
69124Napolitanas con fritas para el bajon :P000
34177Buenas noches. 💜001
2680A las 12:30 conferencia de marianorajoy en el ...131
\n", "

7019 rows × 4 columns

\n", "
" ], "text/plain": [ " tweet hashtags mentions \\\n", "13099 El LG_ES G3 S ya a la venta por 299 euros - 0 1 \n", "32607 No habrá ningún desahucio más en Madrid sin al... 0 0 \n", "69261 El partido socialista derogará la reforma labo... 1 1 \n", "59548 .mdcospedal inicia la campaña electoral del PP... 0 1 \n", "21659 BuenosDíasUAM!!! \"La clave de la educación no ... 3 0 \n", "... ... ... ... \n", "3344 Nada más lindo q estar en casa :) 0 0 \n", "64788 || Yo no me entero de que ha pasado. :D 0 0 \n", "69124 Napolitanas con fritas para el bajon :P 0 0 \n", "34177 Buenas noches. 💜 0 0 \n", "2680 A las 12:30 conferencia de marianorajoy en el ... 1 3 \n", "\n", " links \n", "13099 2 \n", "32607 2 \n", "69261 0 \n", "59548 1 \n", "21659 0 \n", "... ... \n", "3344 0 \n", "64788 0 \n", "69124 0 \n", "34177 1 \n", "2680 1 \n", "\n", "[7019 rows x 4 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def count_prefixed_tokens(text, prefix):\n", "    \"\"\"Count the whitespace-separated tokens of `text` that start with `prefix`.\"\"\"\n", "    return sum(token.startswith(prefix) for token in text.split())\n", "\n", "\n", "# load the tab-separated tweet file (no header row) and keep a reproducible 10% sample\n", "tweets = pd.read_csv('https://raw.githubusercontent.com/garnachod/TwitterSentimentDataset/master/tweets_clean.txt', header=None, sep='\\t')\n", "tweets = tweets.sample(frac=0.1, random_state=42)\n", "tweets.columns = ['tweet']\n", "\n", "# feature engineering: count #, @ and links per tweet (must run before they are stripped below)\n", "tweets['hashtags'] = tweets['tweet'].apply(lambda text: count_prefixed_tokens(text, '#'))\n", "tweets['mentions'] = tweets['tweet'].apply(lambda text: count_prefixed_tokens(text, '@'))\n", "tweets['links'] = tweets['tweet'].apply(lambda text: count_prefixed_tokens(text, 'http'))\n", "\n", "# remove the '#' and '@' markers; regex=False makes this an explicit literal match\n", "tweets['tweet'] = tweets['tweet'].str.replace('#', '', regex=False)\n", "tweets['tweet'] = tweets['tweet'].str.replace('@', '', regex=False)\n", "# strip links; the dot after 'www' is escaped so only a literal 'www.' prefix is treated as a link\n", "tweets['tweet'] = tweets['tweet'].str.replace(r'http\\S+|www\\.\\S+', '', regex=True)\n", "\n", "tweets" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ 
"/home/alejo/.local/lib/python3.11/site-packages/sklearn/base.py:348: InconsistentVersionWarning: Trying to unpickle estimator CountVectorizer from version 0.23.2 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/home/alejo/.local/lib/python3.11/site-packages/sklearn/base.py:348: InconsistentVersionWarning: Trying to unpickle estimator MultinomialNB from version 0.23.2 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n" ] } ], "source": [ "# Build the Spanish sentiment analyzer once, then score every cleaned tweet with it.\n", "sentiment = sentiment_analysis.SentimentAnalysisSpanish()\n", "tweets['sentiment'] = tweets['tweet'].apply(sentiment.sentiment)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9094362772997505" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Spot check on a positive sentence (the stored output is close to 1).\n", "sentiment.sentiment(\"Me encanta el fútbol\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.623559185538157e-05" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Spot check on a negative sentence (the stored output is close to 0).\n", "sentiment.sentiment(\"El producto, un desastre\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }