In [6]:
import os
import requests
import re
import ast
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import names, stopwords, words
from nltk.tokenize import word_tokenize
from rake_nltk import Rake
import nltk
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
Get Data¶
In [9]:
def get_news(q, df=True, from_date=None, to_date=None, language=None, searchIn=None, page_size=None, page=None):
endpoint = 'https://newsapi.org/v2/everything'
params = {
'apiKey': os.getenv('NEWS_API_KEY'),
'pageSize': page_size,
'page': page
}
query_params = {
'q': q,
'from': from_date,
'to': to_date,
'language': language,
'searchIn': searchIn
}
for key, value in query_params.items():
if value is not None:
params[key] = value
response = requests.get(endpoint, params=params)
response.raise_for_status()
print(response.url)
if df:
return pd.json_normalize(pd.DataFrame(response.json())['articles'])
else:
return response.json()
In [11]:
df = get_news(
q=f"{'water+climate+change+river+drinking'}",
df=True,
from_date='2024-09-08',
to_date='2024-10-08',
language='en',
page_size=100,
page=1
)
display(df)
https://newsapi.org/v2/everything?apiKey=935933a6b0ce488f86af5022580c4f62&pageSize=100&page=1&q=water%2Bclimate%2Bchange%2Briver%2Bdrinking&from=2024-09-08&to=2024-10-08&language=en
author | title | description | url | urlToImage | publishedAt | content | source.id | source.name | |
---|---|---|---|---|---|---|---|---|---|
0 | Lauren Sommer | Hurricanes are dangerous far from the coast. C... | Coastal cities often bear the brunt of hurrica... | https://www.npr.org/2024/10/01/nx-s1-5133530/h... | https://npr.brightspotcdn.com/dims3/default/st... | 2024-10-02T09:00:00Z | Hurricane Helenes destructive path tore across... | None | NPR |
1 | None | New tool to help decision-makers navigate poss... | The Colorado River is a vital source of water ... | https://www.sciencedaily.com/releases/2024/09/... | https://www.sciencedaily.com/images/scidaily-i... | 2024-09-20T20:08:11Z | The Colorado River is a vital source of water ... | None | Science Daily |
2 | Matthew Carroll | New tool to help decision makers navigate poss... | The Colorado River is a vital source of water ... | https://phys.org/news/2024-09-tool-decision-ma... | https://scx2.b-cdn.net/gfx/news/hires/2024/new... | 2024-09-21T15:05:35Z | The Colorado River is a vital source of water ... | None | Phys.Org |
3 | Ariel Wittenberg, E&E News | Hurricanes Helene’s Floods Swamped a Hospital,... | Hurricane Helene forced dozens of medical faci... | https://subscriber.politicopro.com/article/een... | https://static.scientificamerican.com/dam/m/78... | 2024-10-02T18:45:00Z | CLIMATEWIRE | A dramatic helicopter evacuation... | None | Politicopro.com |
4 | Al Jazeera | Water levels in major Amazon tributary tumble ... | Climate change and below-average rainfall have... | https://www.aljazeera.com/gallery/2024/10/5/wa... | https://www.aljazeera.com/wp-content/uploads/2... | 2024-10-05T01:49:16Z | Its one of the largest rivers in the world. An... | al-jazeera-english | Al Jazeera English |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
91 | Dariel Pradas | A Cuban Town Improves Water Quality Through De... | Overnight, hundreds of people in the rural com... | https://www.ipsnews.net/2024/09/cuban-town-imp... | https://www.ipsnews.net/Library/2024/09/Agua-1... | 2024-09-09T15:44:14Z | Editors' Choice, Featured, Headlines, Health, ... | None | Inter Press Service |
92 | Khabarhub | Langtang National Park: Understanding climate ... | Langtang National Park — Nepal’s High Mountain... | https://english.khabarhub.com/2024/18/397905/ | https://english.khabarhub.com/wp-content/uploa... | 2024-09-18T06:15:48Z | Langtang National Park — Nepal’s High Mountain... | None | Khabarhub.com |
93 | Oritro Karim | Typhoon Yagi Devastates Southeast Asia | In early September, Typhoon Yagi, a deadly tro... | https://www.ipsnews.net/2024/09/typhoon-yagi-d... | https://www.ipsnews.net/Library/2024/09/The-af... | 2024-09-19T09:57:08Z | Asia-Pacific, Climate Change, Economy & Tr... | None | Inter Press Service |
94 | Dima Al-Khatib | A Better Tomorrow with South-South Cooperation | The annual United Nations Day for South-South ... | https://www.ipsnews.net/2024/09/better-tomorro... | https://www.ipsnews.net/Library/2024/09/A-Bett... | 2024-09-12T06:17:13Z | Civil Society, Development & Aid, Economy ... | None | Inter Press Service |
95 | Rina Mukherji | Rejuvenating Tradition Help Save Ancient Engin... | Dhamapur is a small village in Malvan taluka o... | https://www.ipsnews.net/2024/10/rejuvenating-t... | https://www.ipsnews.net/Library/2024/10/Dhamap... | 2024-10-02T09:50:31Z | Arts, Asia-Pacific, Civil Society, Conservatio... | None | Inter Press Service |
96 rows × 9 columns
In [12]:
# Function to parse the content of each article
def parse_article_content(url):
print(url)
try:
response = requests.get(url, timeout=10) # Set a timeout of 10 seconds
response.raise_for_status()
except requests.exceptions.RequestException as e:
print(e)
return None
soup = BeautifulSoup(response.content, 'html.parser')
# Extract the text from the article
paragraphs = soup.find_all('p')
article_text = ' '.join([para.get_text() for para in paragraphs])
words = re.findall(r'\b\w+\b', article_text)
return words
# Apply the function to each URL in the dataframe
df['parsed_content'] = df['url'].apply(parse_article_content)
https://www.npr.org/2024/10/01/nx-s1-5133530/hurricane-helene-rain-flooding-climate-change https://www.sciencedaily.com/releases/2024/09/240920160811.htm 403 Client Error: Forbidden for url: https://www.sciencedaily.com/releases/2024/09/240920160811.htm https://phys.org/news/2024-09-tool-decision-makers-futures-colorado.html 400 Client Error: Bad request for url: https://phys.org/news/2024-09-tool-decision-makers-futures-colorado.html https://subscriber.politicopro.com/article/eenews/2024/10/02/a-dangerous-rescue-helene-made-a-hospital-almost-inescapable-00181734 https://www.aljazeera.com/gallery/2024/10/5/water-levels-in-major-amazon-tributary-tumble-to-record-lows-amid-drought https://phys.org/news/2024-09-amazon-river-dries-hellish-villagers.html 400 Client Error: Bad request for url: https://phys.org/news/2024-09-amazon-river-dries-hellish-villagers.html https://www.aljazeera.com/news/2024/9/19/storm-boris-floods-northern-italy-as-leaders-to-discuss-eu-aid-in-poland https://phys.org/news/2024-09-nepal-surveys-wreckage-death-toll.html 400 Client Error: Bad request for url: https://phys.org/news/2024-09-nepal-surveys-wreckage-death-toll.html https://www.nakedcapitalism.com/2024/09/how-cities-run-dry.html https://phys.org/news/2024-09-city.html 400 Client Error: Bad request for url: https://phys.org/news/2024-09-city.html https://www.poynter.org/reporting-editing/2024/climate-change-stories-are-everywhere-heres-how-to-find-them/ 403 Client Error: Forbidden for url: https://www.poynter.org/reporting-editing/2024/climate-change-stories-are-everywhere-heres-how-to-find-them/ https://www.forbes.com/sites/kensilverstein/2024/10/07/heat-and-drought-imperil-the-panama-canal-and-adjoining-rainforests/ https://www.project-syndicate.org/magazine/water-security-crisis-nature-of-the-problem-and-what-to-do-by-mariana-mazzucato-et-al-2024-09 https://www.bbc.com/news/articles/cd6qvpe0dxqo?xtor=AL-72-%5Bpartner%5D-%5Byahoo.north.america%5D-%5Bheadline%5D-%5Bnews%5D-%5Bbizdev%5D-%5Bisapi%5D http://grist.org/drought/in-arid-new-mexico-a-debate-over-reusing-oil-industry-wastewater/ https://phys.org/news/2024-09-gateway-arctic-fat-ice-polar.html 400 Client Error: Bad request for url: https://phys.org/news/2024-09-gateway-arctic-fat-ice-polar.html https://thediplomat.com/2024/09/death-toll-climbs-to-199-in-vietnam-as-typhoons-aftermath-brings-flash-floods-and-landslides/ https://www.forbes.com/sites/amyfeldman/2024/09/13/a-new-technology-for-getting-rid-of-forever-chemicals-adam-neumann-clean-energy-transition/ https://phys.org/news/2024-09-world-million-tons-plastic-pollution.html 400 Client Error: Bad request for url: https://phys.org/news/2024-09-world-million-tons-plastic-pollution.html https://www.ibtimes.com/drowned-hurricane-remote-ncarolina-towns-now-struggle-water-3745739 403 Client Error: Forbidden for url: https://www.ibtimes.com/drowned-hurricane-remote-ncarolina-towns-now-struggle-water-3745739 https://removed.com HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)'))) https://www.voanews.com/a/chad-floods-kill-503-affect-1-7-million-people-un-says-/7793700.html https://www.rawstory.com/nepal-surveys-flood-wreckage-as-toll-reaches-198/ 403 Client Error: Forbidden for url: https://www.rawstory.com/nepal-surveys-flood-wreckage-as-toll-reaches-198/ https://removed.com HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)'))) https://www.ibtimes.com/nepal-surveys-flood-wreckage-death-toll-reaches-200-3744951 403 Client Error: Forbidden for url: https://www.ibtimes.com/nepal-surveys-flood-wreckage-death-toll-reaches-200-3744951 https://removed.com HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)'))) https://www.voanews.com/a/vietnam-death-toll-climbs-to-199-as-typhoon-aftermath-brings-flash-floods-landslides/7781246.html https://www.yahoo.com/news/amazon-river-dries-creating-hellish-195035044.html https://gcaptain.com/brazils-coastline-faces-accelerated-erosion-as-atlantic-advances/ https://www.bbc.com/news/articles/cd6qvpe0dxqo https://www.insurancejournal.com/news/southeast/2024/10/04/795811.htm https://www.irishtimes.com/environment/2024/10/04/new-59m-reservoir-will-enhance-security-and-resilience-of-water-supply-across-greater-dublin-area/ https://abcnews.go.com/International/wireStory/brazils-worst-drought-wildfires-rage-amazon-river-falls-113540209 404 Client Error: Not Found for url: https://abcnews.go.com/International/wireStory/brazils-worst-drought-wildfires-rage-amazon-river-falls-113540209 https://www.independent.ie/irish-news/experts-say-irish-economy-will-lose-billions-to-climate-change-amid-failure-to-prepare/a1807742926.html https://punchng.com/ngo-seeks-support-for-borno-flood-victims/ https://ca.news.yahoo.com/imperial-oil-could-shutter-norman-001241672.html https://economictimes.indiatimes.com/news/india/india-eu-agree-to-enhance-cooperation-in-sustainable-water-management/articleshow/113463895.cms https://www.cbc.ca/news/canada/edmonton/site-c-dam-bc-hydro-alberta-peace-river-1.7322035 HTTPSConnectionPool(host='www.cbc.ca', port=443): Read timed out. (read timeout=10) https://mymodernmet.com/independent-photographer-travel-photo-awards/ https://www.investing.com/news/world-news/depth-of-major-amazon-tributary-in-brazil-drops-to-record-low-in-severe-drought-3641797 403 Client Error: Forbidden for url: https://www.investing.com/news/world-news/depth-of-major-amazon-tributary-in-brazil-drops-to-record-low-in-severe-drought-3641797 https://www.digitaljournal.com/world/drowned-by-hurricane-remote-n-carolina-towns-now-struggle-for-water/article 403 Client Error: Forbidden for url: https://www.digitaljournal.com/world/drowned-by-hurricane-remote-n-carolina-towns-now-struggle-for-water/article https://abcnews.go.com/Technology/wireStory/gateway-arctic-fat-ice-polar-bears-crucial-trouble-113957248 https://www.bostonherald.com/2024/09/27/wildfire-replanting-efforts/ https://cleantechnica.com/2024/09/25/risk-fingerprints-gray-rhinos-help-communication-strategy/ 403 Client Error: Forbidden for url: https://cleantechnica.com/2024/09/25/risk-fingerprints-gray-rhinos-help-communication-strategy/ https://abcnews.go.com/US/wireStory/after-storms-francine-new-orleans-rushes-dry-113675709 404 Client Error: Not Found for url: https://abcnews.go.com/US/wireStory/after-storms-francine-new-orleans-rushes-dry-113675709 https://punchng.com/mr-president-was-the-borno-flood-a-natural-disaster/ https://www.commondreams.org/opinion/appalachia-climate-crisis 403 Client Error: Forbidden for url: https://www.commondreams.org/opinion/appalachia-climate-crisis https://thehillstimes.in/international/vietnam-death-toll-climbs-to-199-as-typhoons-aftermath-brings-flash-floods-and-landslides https://www.abc.net.au/news/2024-09-11/walgett-namoi-river-water-testing-reveals-metals-pesticides/104329362 https://abcnews.go.com/Health/wireStory/amazon-ashaninka-tribe-restored-territory-now-aim-change-113620295 404 Client Error: Not Found for url: https://abcnews.go.com/Health/wireStory/amazon-ashaninka-tribe-restored-territory-now-aim-change-113620295 https://iasexamportal.com/upsc-mains/papers/2024-general-studies-paper-4 https://www.yahoo.com/news/many-forests-fail-recover-wildfires-133105574.html https://slate.com/life/2024/10/asheville-floods-trees-art-damage-death-dams.html http://deadline.com/2024/09/micheaux-film-festival-2024-viola-davis-michael-ealy-morris-chestnut-michael-gandolfini-1236090280/ https://www.americanthinker.com/blog/2024/09/indigenous_communities_sound_the_alarm_on_the_environmental_destruction_caused_by_surge_of_illegals_invited_through_biden_harris_open_border.html https://www.pbs.org/newshour/world/brazil-faces-its-worst-drought-as-wildfires-rage-and-amazon-river-falls-to-record-low https://www.forbes.com/sites/kailayu/2024/09/30/57-irresistible-pastas-from-restaurants-worldwide-for-pasta-month/ https://thehillstimes.in/international/rescue-operations-continue-as-over-200-people-killed-in-floods-landslides-in-nepal https://www.business-standard.com/world-news/amid-drought-in-brazil-wildfires-rage-amazon-level-falls-to-record-low-124091000052_1.html 403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/amid-drought-in-brazil-wildfires-rage-amazon-level-falls-to-record-low-124091000052_1.html https://www.business-standard.com/world-news/vietnam-toll-climbs-to-199-as-yagi-aftermath-brings-flash-flood-landslides-124091200601_1.html 403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/vietnam-toll-climbs-to-199-as-yagi-aftermath-brings-flash-flood-landslides-124091200601_1.html https://removed.com HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)'))) https://www.ndtv.com/world-news/rescue-operations-continue-as-over-200-people-killed-in-floods-landslides-in-nepal-6683768 https://removed.com HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)'))) https://www.globenewswire.com/news-release/2024/09/24/2952352/0/en/Water-Quality-Instruments-Market-is-Exhibiting-a-CAGR-of-7-10-during-the-forecast-period-Exactitude-Consultancy.html https://journals.plos.org/plosntds/article?id=10.1371/journal.pntd.0012139 https://www.cbc.ca/news/canada/manitoba/trouble-arctic-ecosystem-polar-bears-1.7332545 HTTPSConnectionPool(host='www.cbc.ca', port=443): Read timed out. (read timeout=10) https://www.globenewswire.com/news-release/2024/10/03/2957680/0/en/Beyond-Green-Welcomes-New-Member-Properties-in-Africa-Europe-and-Latin-America-Thoughtfully-Expanding-its-Global-Footprint.html https://financialpost.com/pmn/as-many-forests-fail-to-recover-from-wildfires-replanting-efforts-face-huge-odds-and-obstacles 403 Client Error: Forbidden for url: https://financialpost.com/pmn/as-many-forests-fail-to-recover-from-wildfires-replanting-efforts-face-huge-odds-and-obstacles https://bmjopen.bmj.com/content/14/9/e083624 https://abcnews.go.com/Business/wireStory/forests-fail-recover-wildfires-replanting-efforts-face-huge-114275681 https://www.pbs.org/newshour/world/research-reveals-threats-polar-bears-face-as-climate-change-melts-arctic-ice-hunting-grounds https://removed.com HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)'))) https://www.theatlantic.com/ideas/archive/2024/09/michael-regan-epa-environmental-justice-lawsuit/679941/ https://www.bangkokpost.com/life/social-and-lifestyle/2866857/navigating-rain-bombs https://slate.com/technology/2024/10/hurricane-helene-destruction-north-carolina-florida-georgia-climate-change.html https://richmond.com/news/nation-world/israel-fights-on-two-fronts/article_62edc161-35aa-52fe-b765-1fa0cface714.html https://removed.com HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)'))) https://removed.com HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)'))) https://journals.plos.org/plosntds/article?id=10.1371/journal.pntd.0012483 https://roanoke.com/news/nation-world/israel-fights-on-two-fronts/article_fd2c97b3-87b7-5629-beef-cfdb9520d6bf.html 429 Client Error: Too Many Requests for url: https://roanoke.com/news/nation-world/israel-fights-on-two-fronts/article_fd2c97b3-87b7-5629-beef-cfdb9520d6bf.html https://www.seattlepi.com/news/world/article/amid-the-worst-drought-in-brazil-history-19753496.php 403 Client Error: Forbidden for url: https://www.seattlepi.com/news/world/article/amid-the-worst-drought-in-brazil-history-19753496.php https://www.seattlepi.com/news/article/ap-photos-hallmarks-of-climate-change-seen-in-19771941.php 403 Client Error: Forbidden for url: https://www.seattlepi.com/news/article/ap-photos-hallmarks-of-climate-change-seen-in-19771941.php https://www.ibtimes.com.au/nepal-surveys-flood-wreckage-death-toll-reaches-200-1852011 403 Client Error: Forbidden for url: https://www.ibtimes.com.au/nepal-surveys-flood-wreckage-death-toll-reaches-200-1852011 https://journals.plos.org/plosntds/article?id=10.1371/journal.pntd.0012508 https://www.resilience.org/stories/2024-09-26/hurricanes-dont-stop-at-the-coast-these-mountain-towns-know-how-severe-inland-flood-damage-can-be-and-theyre-watching-helene/ https://www.seattlepi.com/news/world/article/in-the-amazon-the-ashaninka-tribe-restored-their-19760299.php 403 Client Error: Forbidden for url: https://www.seattlepi.com/news/world/article/in-the-amazon-the-ashaninka-tribe-restored-their-19760299.php https://nation.africa/kenya/health/report-reveals-why-african-countries-face-persistent-water-woes-4776490 https://www.esquire.com/sports/a62215598/vince-lombardi-toughest-nfl-coach-1968/ https://press.un.org/en/2024/db241007.doc.htm https://itif.org/publications/2024/09/16/china-is-rapidly-becoming-a-leading-innovator-in-advanced-industries/ https://en.setopati.com/International/163665 404 Client Error: Not Found for url: https://en.setopati.com/International/163665%20 https://www.ipsnews.net/2024/09/cuban-town-improves-water-quality-desalination/ https://english.khabarhub.com/2024/18/397905/ https://www.ipsnews.net/2024/09/typhoon-yagi-devastates-southeast-asia/ https://www.ipsnews.net/2024/09/better-tomorrow-south-south-cooperation/ https://www.ipsnews.net/2024/10/rejuvenating-traditions-help-save-ancient-engineering-marvel-dhamapur-lake/
In [6]:
df = df.dropna(subset=['parsed_content'])
df = df.drop_duplicates(subset=['title'])
display(df['parsed_content'])
# Save raw data
df.to_csv('data/water+climate+change+river+drinking_news_202140908_20241008.csv', index=False)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[6], line 1 ----> 1 df = df.dropna(subset=['parsed_content']) 2 df = df.drop_duplicates(subset=['title']) 3 display(df['parsed_content']) NameError: name 'df' is not defined
Clean Data¶
In [7]:
# Download the list of English words and stopwords
nltk.download('words')
nltk.download('stopwords')
english_words = set(nltk.corpus.words.words())
stop_words = set(stopwords.words('english'))
# Load the data
df = pd.read_csv('data/water+climate+change+river+drinking_news_202140908_20241008.csv')
def clean_content(content):
if isinstance(content, list):
content = ' '.join(content)
# Convert to lowercase
content = content.lower()
# Replace 'climate change' with 'climatechange'
content = content.replace('climate change', 'climatechange')
# Remove numbers and dates
content = re.sub(r'\d+', '', content)
content = re.sub(r'\b\d{4}\b', '', content) # Remove 4-digit years
# Summarize the content
parser = PlaintextParser.from_string(content, Tokenizer("english"))
summarizer = LsaSummarizer()
summary = summarizer(parser.document, 5) # Summarize to 5 sentences
content = ' '.join([str(sentence) for sentence in summary])
# Initialize RAKE
rake = Rake()
# Extract keywords
rake.extract_keywords_from_text(content)
keywords = rake.get_ranked_phrases()
# Filter out non-English words and stopwords
keywords = [word for word in keywords if word in english_words and word not in stop_words]
# Remove duplicate words within a transaction
keywords = list(set(keywords))
return keywords
# Apply the clean_content function
df['transactions'] = df['parsed_content'].apply(ast.literal_eval)
df['cleaned_content'] = df['transactions'].apply(clean_content)
# Remove duplicates
df = df.drop_duplicates(subset=['cleaned_content'])
# Remove rows with empty cleaned_content
df = df[df['cleaned_content'].apply(lambda x: len(x) > 0)]
# Display the DataFrame
display(df[['parsed_content', 'transactions', 'cleaned_content']])
# Convert series of lists into a format suitable for R's read.transactions
def write_basket_format(df, filename):
with open(filename, 'w') as f:
for transaction in df['cleaned_content']:
f.write(','.join(transaction) + '\n')
# Write to a text file
write_basket_format(df, 'transactions_basket.txt')
[nltk_data] Downloading package words to [nltk_data] /Users/garrettflowers/nltk_data... [nltk_data] Package words is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] /Users/garrettflowers/nltk_data... [nltk_data] Package stopwords is already up-to-date!
parsed_content | transactions | cleaned_content | |
---|---|---|---|
0 | ['Lauren', 'Sommer', 'Extreme', 'rain', 'is', ... | [Lauren, Sommer, Extreme, rain, is, becoming, ... | [known, works, warning, know, rainfall, floodi... |
2 | ['In', 'Pictures', 'It', 's', 'one', 'of', 'th... | [In, Pictures, It, s, one, of, the, largest, r... | [community, also, world, intense, end, country... |
3 | ['Italy', 's', 'Emilia', 'Romagna', 'region', ... | [Italy, s, Emilia, Romagna, region, sees, thre... | [aid, suspended, led, frequency, evacuation, c... |
4 | ['Yves', 'here', 'Yours', 'truly', 'has', 'bee... | [Yves, here, Yours, truly, has, been, under, r... | [pacific, glass, amount, return, prepared, cha... |
5 | ['GAMBOA', 'PANAMA', 'SEPTEMBER', '20', 'In', ... | [GAMBOA, PANAMA, SEPTEMBER, 20, In, an, aerial... | [causing, post, background, resulting, open, a... |
6 | ['Increasingly', 'severe', 'water', 'shortages... | [Increasingly, severe, water, shortages, repre... | [also, already, resolved, link, receive, syste... |
7 | ['Water', 'levels', 'in', 'many', 'of', 'the',... | [Water, levels, in, many, of, the, rivers, in,... | [anchored, agriculture, role, critical, likely... |
8 | ['Δ', 'A', 'nonprofit', 'independent', 'media'... | [Δ, A, nonprofit, independent, media, organiza... | [spread, addition, uranium, interview, return,... |
9 | ['Read', 'The', 'Diplomat', 'Know', 'The', 'As... | [Read, The, Diplomat, Know, The, Asia, Pacific... | [damage, goods, response, flooding, shelter, s... |
10 | ['Plus', 'Adam', 'Neumann', 's', 'climate', 'c... | [Plus, Adam, Neumann, s, climate, company, is,... | [post, open, end, go, work, event, purpose, ma... |
11 | ['Print', 'Severe', 'flooding', 'in', 'Chad', ... | [Print, Severe, flooding, in, Chad, since, Jul... | [summer, chad, slew, trouble, come, capital, d... |
12 | ['Print', 'The', 'death', 'toll', 'in', 'the',... | [Print, The, death, toll, in, the, aftermath, ... | [also, hand, aid, strategic, ground, flooding,... |
13 | ['Only', 'the', 'youngest', 'and', 'strongest'... | [Only, the, youngest, and, strongest, villager... | [goods, favorite, end, sale, country, rest, go... |
14 | ['Essential', 'news', 'coupled', 'with', 'the'... | [Essential, news, coupled, with, the, finest, ... | [rate, ocean, catastrophe, loss, arsenic, subm... |
15 | ['HENDERSONVILLE', 'N', 'C', 'AP', 'Hurricane'... | [HENDERSONVILLE, N, C, AP, Hurricane, Helene, ... | [community, reminder, topic, map, friend, floo... |
16 | ['The', 'completion', 'of', 'a', 'new', '59', ... | [The, completion, of, a, new, 59, million, res... | [works, landscape, critical, likely, project, ... |
17 | ['There', 'have', 'been', 'calls', 'for', 'the... | [There, have, been, calls, for, the, Governmen... | [works, also, spread, damage, spring, calling,... |
18 | ['Most', 'Widely', 'Read', 'Newspaper', 'An', ... | [Most, Widely, Read, Newspaper, An, aerial, vi... | [danger, also, solidarity, flooding, earn, cit... |
19 | ['Imperial', 'Oil', 'and', 'Sahtu', 'leaders',... | [Imperial, Oil, and, Sahtu, leaders, are, lock... | [review, likely, end, operation, many, go, lan... |
20 | ['Artificial', 'Intelligence', 'AI', 'AI', 'fo... | [Artificial, Intelligence, AI, AI, for, Everyo... | [interested, web, experience, money, finance, ... |
21 | ['Supplies', 'for', 'Phugtal', 'Monastery', 'b... | [Supplies, for, Phugtal, Monastery, by, Andrew... | [end, country, work, food, search, variety, sy... |
22 | ['In', 'the', 'gateway', 'to', 'the', 'Arctic'... | [In, the, gateway, to, the, Arctic, fat, and, ... | [policy, amount, battle, staple, end, rest, bo... |
23 | ['E', 'Edition', 'Sign', 'up', 'for', 'email',... | [E, Edition, Sign, up, for, email, newsletters... | [forest, open, likely, go, work, private, righ... |
24 | ['Most', 'Widely', 'Read', 'Newspaper', 'Steph... | [Most, Widely, Read, Newspaper, Stephen, Angbu... | [evening, grant, open, pray, hear, country, mi... |
25 | ['08', 'October', '2024', 'ePaper', '07', 'Oct... | [08, October, 2024, ePaper, 07, October, 2024,... | [damage, know, goods, flooding, shelter, sapa,... |
26 | ['Search', 'the', 'news', 'stories', 'people',... | [Search, the, news, stories, people, Personali... | [fact, work, familiar, provide, search, found,... |
27 | ['HOT', 'UPSC', 'IAS', 'ONLINE', 'COURSE', 'NE... | [HOT, UPSC, IAS, ONLINE, COURSE, NEW, UPSC, MA... | [procurement, spread, resolved, plunging, disc... |
28 | ['BELLVUE', 'Colo', 'AP', 'Camille', 'Stevens'... | [BELLVUE, Colo, AP, Camille, Stevens, Rumann, ... | [forest, open, likely, end, sale, country, go,... |
29 | ['In', 'What', 'It', 's', 'Like', 'people', 't... | [In, What, It, s, Like, people, tell, us, well... | [evening, based, amount, interview, hear, prep... |
30 | ['By', 'Zac', 'Ntim', 'International', 'Report... | [By, Zac, Ntim, International, Reporter, EXCLU... | [return, prep, appearance, fourth, make, wound... |
31 | ['Author', 's', 'note', 'Immigrant', 'implies'... | [Author, s, note, Immigrant, implies, a, forei... | [guardian, work, stretch, provide, make, found... |
32 | ['Fabiano', 'Maisonnave', 'Associated', 'Press... | [Fabiano, Maisonnave, Associated, Press, Fabia... | [landscape, world, intense, country, burned, n... |
33 | ['Valentine', 'PHX', 's', 'Elote', 'Pasta', 'I... | [Valentine, PHX, s, Elote, Pasta, It, s, Natio... | [nod, personal, anchored, union, post, based, ... |
34 | ['08', 'October', '2024', 'ePaper', '07', 'Oct... | [08, October, 2024, ePaper, 07, October, 2024,... | [spread, post, leave, gear, amount, know, resp... |
35 | ['Search', 'and', 'rescue', 'operations', 'con... | [Search, and, rescue, operations, continued, i... | [headline, also, spread, post, drainage, gear,... |
36 | ['September', '24', '2024', '10', '24', 'ET', ... | [September, 24, 2024, 10, 24, ET, Source, Exac... | [alkalinity, addition, gaining, food, found, s... |
37 | ['PLOS', 'Neglected', 'Tropical', 'Diseases', ... | [PLOS, Neglected, Tropical, Diseases, is, the,... | [compare, based, topic, addition, calculated, ... |
38 | ['October', '03', '2024', '08', '05', 'ET', 'S... | [October, 03, 2024, 08, 05, ET, Source, Beyond... | [community, social, passion, book, fostering, ... |
39 | ['Article', 'Text', 'PDF', 'PDF', 'Supplementa... | [Article, Text, PDF, PDF, Supplementary, Mater... | [translation, reverse, escape, available, opin... |
40 | ['Seth', 'Borenstein', 'Associated', 'Press', ... | [Seth, Borenstein, Associated, Press, Seth, Bo... | [policy, amount, battle, staple, end, rest, bo... |
41 | ['Michael', 'Regan', 'seemed', 'like', 'he', '... | [Michael, Regan, seemed, like, he, was, spoili... | [strident, possibility, policy, battle, ground... |
42 | ['As', 'the', 'North', 'grapples', 'with', 'fl... | [As, the, North, grapples, with, floods, famil... | [addition, country, go, provide, bought, face,... |
43 | ['We', 'haven', 't', 'seen', 'a', 'disaster', ... | [We, haven, t, seen, a, disaster, like, this, ... | [warning, diameter, return, ground, harder, el... |
44 | ['Alexis', 'Christoforous', 'has', 'the', 'lat... | [Alexis, Christoforous, has, the, latest, on, ... | [earthquake, end, country, operation, go, atta... |
45 | ['PLOS', 'Neglected', 'Tropical', 'Diseases', ... | [PLOS, Neglected, Tropical, Diseases, is, the,... | [possibility, spread, reproduction, available,... |
46 | ['PLOS', 'Neglected', 'Tropical', 'Diseases', ... | [PLOS, Neglected, Tropical, Diseases, is, the,... | [assistance, grant, addition, investigation, c... |
47 | ['Insight', 'and', 'inspiration', 'in', 'turbu... | [Insight, and, inspiration, in, turbulent, tim... | [open, map, river, commerce, respond, guard, s... |
48 | ['Hello', 'Your', 'subscription', 'is', 'almos... | [Hello, Your, subscription, is, almost, coming... | [likely, continent, end, demise, rest, country... |
49 | ['If', 'you', 'weren', 't', 'fired', 'up', 'to... | [If, you, weren, t, fired, up, to, play, for, ... | [ground, digs, mouth, custom, escape, availabl... |
50 | ['The', 'following', 'is', 'a', 'near', 'verba... | [The, following, is, a, near, verbatim, transc... | [refer, vacuum, unbalanced, fact, phone, likel... |
51 | ['There', 'may', 'be', 'no', 'more', 'importan... | [There, may, be, no, more, important, question... | [erode, policy, topic, strategy, exceeding, av... |
52 | ['Tuesday', 'October', '8', '2024', 'Two', 'pe... | [Tuesday, October, 8, 2024, Two, people, colle... | [addition, quench, interview, country, membran... |
53 | ['18', 'September', '2024', 'Time', 'taken', '... | [18, September, 2024, Time, taken, to, read, 1... | [forest, policy, topic, grazing, earthquake, u... |
54 | ['Tuesday', 'October', '8', '2024', 'The', 'af... | [Tuesday, October, 8, 2024, The, aftermath, of... | [agriculture, damage, addition, led, critical,... |
55 | ['Tuesday', 'October', '8', '2024', 'Credit', ... | [Tuesday, October, 8, 2024, Credit, United, Na... | [satisfactory, solidarity, enhance, crucial, a... |
56 | ['Tuesday', 'October', '8', '2024', 'The', 'Vi... | [Tuesday, October, 8, 2024, The, Vijayanagar, ... | [fact, reverse, encroachment, flora, found, pl... |
In [8]:
# Display transaction top 5 lines
with open('transactions_basket.txt', 'r') as f:
for i, line in enumerate(f):
if i < 5:
print(line.strip())
else:
break
known,works,warning,know,rainfall,flooding,country,rest,reflect,hard,ground,soaking,turn,becoming,happening,cut,city,make,river,runoff,rebuild,live,difference,u,mud,professor,adequate,sign,risk,cutting,isolated,effort,eliminate,able,infrastructure,fast,think,designed,year,deadly,severity,atlas,trend,imagine,problem,situation,making,today,set,streets,hurricane,may,built,rain,hold,depend,category,disaster,chance,better,future,plan,longer,association,people,safety,much,continue,going,safe,handle community,also,world,intense,end,country,right,spark,people,researcher,percent,one,measurement,paper,bathing,around,shrunk,tumble,statement,brazil,fallen,covered,told,transportation,port,water,jump,head,environmental,stress,weekend,north,drought,suffering,common,way,effects,everything,depth,struggling,much,revenue aid,suspended,led,frequency,evacuation,country,contribution,electricity,person,disposed,upriver,city,place,intensity,money,met,mayor,mud,wave,increasing,fallen,said,infrastructure,five,help,well,closed,necessary,southwest,housing,carried,scale,water,governor,saying,way,people,rise,town pacific,glass,amount,return,prepared,charge,go,theory,many,prerequisite,provide,make,found,place,large,attack,variety,physics,sealed,understanding,overuse,research,nearly,filled,merely,five,revive,lake,tiny,contact,told,declare,water,reduce,company,form,answer,emergency,combination,area,solution,add,possible,reach,continue,stave,basin,leach,rainfall,none,suppose,stud,department,experience,struck,coming,believe,starting,say,links,u,effort,university,series,lot,sides,internationally,year,application,severity,gag,exposed,subject,example,volume,reason,want,potential,instance,often,dip,partnership,cornerstone,land,early,panel,much,runoff,able,news,already,cadmium,metal,dry,world,month,least,unfolding,proceed,apply,would,evaporation,one,new,result,city,used,method,aluminium,half,worded,responsible,drastically,usual,plenty,lookout,running,top,average,remember,recipe,making,summer,industry,rain,even,winter,record,plastic,better,meant,store,climate,pineapple,seen,ask,flat,cause,know,led,grow,storage,reservoir,failure,live,could,mayor,use,come,lost,aware,defense,sold,energy,left,irrigate,fed,always,truly,precipitation,course,size,time,like,done,number,chile,globe,roof,issue,call,sewage,alone,made,without,deal causing,post,background,resulting,open,amount,country,operation,provide,river,happen,feed,scientist,travel,panama,office,water,institute,soil,engaged,part,site,service,crept,combined,reading,crane,contribute,community,rainfall,creation,talk,growth,system,believe,canoe,week,order,modify,series,second,want,hike,monitor,drought,drop,ship,focus,civil,steal,much,rate,world,one,expand,result,filling,notice,money,waiting,pass,note,millions,necessary,well,blocked,quest,average,wait,integrity,canopy,die,importance,environmental,canal,history,colon,ton,restrict,release,authority,carry,watched,shrouded,transit,cargo,check,movement,diversity,economy,atlantic,completion,cost,region,accommodate,designed,decrease,done,share,mountainous,number,depend,protecting,challenge,lack,communication
ARM¶
In [1]:
# Load necessary libraries
library(arules)
library(arulesViz)
# Read transactions
data <- read.transactions("transactions_basket.txt", format = "basket", sep = ",")
# Generate frequent itemsets
frequent_itemsets <- apriori(data, parameter = list(supp = 0.2, target = "frequent itemsets"))
# Generate association rules
rules <- apriori(data, parameter = list(supp = 0.2, conf = 0.8, target = "rules"))
# Check if any rules were generated
if (length(rules) > 0) {
# Print top 15 rules by support
top_support <- sort(rules, by = "support", decreasing = TRUE)[1:15]
cat("Top 15 rules by support:\n")
inspect(top_support)
# Print top 15 rules by confidence
top_confidence <- sort(rules, by = "confidence", decreasing = TRUE)[1:15]
cat("\nTop 15 rules by confidence:\n")
inspect(top_confidence)
# Print top 15 rules by lift
top_lift <- sort(rules, by = "lift", decreasing = TRUE)[1:15]
cat("\nTop 15 rules by lift:\n")
inspect(top_lift)
# Plot the rules
plot(rules, method = "graph", control = list(type = "items"))
} else {
print("No association rules were generated.")
}
Warning message: "package 'arules' was built under R version 4.3.3" Loading required package: Matrix Attaching package: 'arules' The following objects are masked from 'package:base': abbreviate, write
Apriori Parameter specification: confidence minval smax arem aval originalSupport maxtime support minlen NA 0.1 1 none FALSE TRUE 5 0.2 1 maxlen target ext 10 frequent itemsets TRUE Algorithmic control: filter tree heap memopt load sort verbose 0.1 TRUE TRUE FALSE TRUE 2 TRUE Absolute minimum support count: 11 set item appearances ...[0 item(s)] done [0.00s]. set transactions ...[3627 item(s), 56 transaction(s)] done [0.00s]. sorting and recoding items ... [67 item(s)] done [0.00s]. creating transaction tree ... done [0.00s]. checking subsets of size 1 2 3 4 done [0.00s]. sorting transactions ... done [0.00s]. writing ... [292 set(s)] done [0.00s]. creating S4 object ... done [0.00s]. Apriori Parameter specification: confidence minval smax arem aval originalSupport maxtime support minlen 0.8 0.1 1 none FALSE TRUE 5 0.2 1 maxlen target ext 10 rules TRUE Algorithmic control: filter tree heap memopt load sort verbose 0.1 TRUE TRUE FALSE TRUE 2 TRUE Absolute minimum support count: 11 set item appearances ...[0 item(s)] done [0.00s]. set transactions ...[3627 item(s), 56 transaction(s)] done [0.00s]. sorting and recoding items ... [67 item(s)] done [0.00s]. creating transaction tree ... done [0.00s]. checking subsets of size 1 2 3 4 done [0.00s]. writing ... [144 rule(s)] done [0.00s]. creating S4 object ... done [0.00s]. Top 15 rules by support: lhs rhs support confidence coverage lift count [1] {said} => {people} 0.3750000 0.8750000 0.4285714 1.361111 21 [2] {get} => {water} 0.3035714 1.0000000 0.3035714 1.696970 17 [3] {time} => {water} 0.3035714 0.8500000 0.3571429 1.442424 17 [4] {would} => {year} 0.2857143 0.8421053 0.3392857 1.684211 16 [5] {would} => {water} 0.2857143 0.8421053 0.3392857 1.429027 16 [6] {make} => {way} 0.2857143 0.8421053 0.3392857 1.964912 16 [7] {used} => {year} 0.2857143 0.8000000 0.3571429 1.600000 16 [8] {used} => {water} 0.2857143 0.8000000 0.3571429 1.357576 16 [9] {time} => {people} 0.2857143 0.8000000 0.3571429 1.244444 16 [10] {go} => {year} 0.2678571 1.0000000 0.2678571 2.000000 15 [11] {think} => {year} 0.2678571 0.9375000 0.2857143 1.875000 15 [12] {able} => {water} 0.2678571 0.8333333 0.3214286 1.414141 15 [13] {lot} => {year} 0.2500000 0.8750000 0.2857143 1.750000 14 [14] {far} => {people} 0.2500000 1.0000000 0.2500000 1.555556 14 [15] {go} => {water} 0.2500000 0.9333333 0.2678571 1.583838 14 Top 15 rules by confidence: lhs rhs support confidence coverage lift count [1] {far} => {people} 0.2500000 1 0.2500000 1.555556 14 [2] {go} => {year} 0.2678571 1 0.2678571 2.000000 15 [3] {want} => {water} 0.2500000 1 0.2500000 1.696970 14 [4] {get} => {water} 0.3035714 1 0.3035714 1.696970 17 [5] {lot, would} => {year} 0.2142857 1 0.2142857 2.000000 12 [6] {lot, used} => {year} 0.2142857 1 0.2142857 2.000000 12 [7] {get, go} => {year} 0.2142857 1 0.2142857 2.000000 12 [8] {get, go} => {water} 0.2142857 1 0.2142857 1.696970 12 [9] {go, would} => {year} 0.2142857 1 0.2142857 2.000000 12 [10] {go, would} => {water} 0.2142857 1 0.2142857 1.696970 12 [11] {go, people} => {year} 0.2142857 1 0.2142857 2.000000 12 [12] {go, water} => {year} 0.2500000 1 0.2500000 2.000000 14 [13] {going, think} => {year} 0.2321429 1 0.2321429 2.000000 13 [14] {going, way} => {know} 0.2142857 1 0.2142857 2.800000 12 [15] {want, year} => {water} 0.2142857 1 0.2142857 1.696970 12 Top 15 rules by lift: lhs rhs support confidence coverage lift [1] {know, think, year} => {going} 0.2142857 1.0000000 0.2142857 3.733333 [2] {going, know, year} => {think} 0.2142857 1.0000000 0.2142857 3.500000 [3] {get, year} => {go} 0.2142857 0.9230769 0.2321429 3.446154 [4] {know, think} => {going} 0.2142857 0.9230769 0.2321429 3.446154 [5] {get, water, year} => {go} 0.2142857 0.9230769 0.2321429 3.446154 [6] {going, year} => {think} 0.2321429 0.9285714 0.2500000 3.250000 [7] {think, year} => {going} 0.2321429 0.8666667 0.2678571 3.235556 [8] {going, know} => {think} 0.2142857 0.9230769 0.2321429 3.230769 [9] {know, year} => {going} 0.2142857 0.8571429 0.2500000 3.200000 [10] {water, would, year} => {go} 0.2142857 0.8571429 0.2500000 3.200000 [11] {going} => {think} 0.2321429 0.8666667 0.2678571 3.033333 [12] {think} => {going} 0.2321429 0.8125000 0.2857143 3.033333 [13] {know, year} => {think} 0.2142857 0.8571429 0.2500000 3.000000 [14] {way, year} => {think} 0.2142857 0.8571429 0.2500000 3.000000 [15] {know, way} => {going} 0.2142857 0.8000000 0.2678571 2.986667 count [1] 12 [2] 12 [3] 12 [4] 12 [5] 12 [6] 13 [7] 13 [8] 12 [9] 12 [10] 12 [11] 13 [12] 13 [13] 12 [14] 12 [15] 12
Warning message: "Unknown control parameters: type"
Available control parameters (with default values): layout = stress circular = FALSE ggraphdots = NULL edges = <environment> nodes = <environment> nodetext = <environment> colors = c("#EE0000FF", "#EEEEEEFF") engine = ggplot2 max = 100 verbose = FALSE
Warning message: "Too many rules supplied. Only plotting the best 100 using 'lift' (change control parameter max if needed)."