-- You are on page 1 of 5

/* This Project focuses on preparing and cleaning a Nashville housing dataset using SQL.

*/

-- First look at the raw table before any cleaning is applied.
SELECT *
FROM Practice.dbo.NashvilleHousing;

/* Part 1 - Standardize SaleDate Format */

-- Preview: the stored SaleDate next to its DATE-only conversion.
SELECT
    SaleDate,
    CONVERT(DATE, SaleDate) AS SaleDateStandardized
FROM Practice.dbo.NashvilleHousing;

-- Add a new column to hold the converted value.
-- The COL_LENGTH guard skips the ADD when the column already exists, so the
-- script can be re-run without a "column names must be unique" error.
IF COL_LENGTH('Practice.dbo.NashvilleHousing', 'SaleDateConverted') IS NULL
    ALTER TABLE Practice.dbo.NashvilleHousing
    ADD SaleDateConverted DATE;
GO
-- GO (the SSMS / sqlcmd batch separator) ends the batch here on purpose:
-- a batch is compiled as a whole, so an UPDATE that references a column
-- added earlier in the same batch fails with "Invalid column name".

UPDATE Practice.dbo.NashvilleHousing
SET SaleDateConverted = CONVERT(DATE, SaleDate);

-- Check the populated column.
SELECT SaleDateConverted
FROM Practice.dbo.NashvilleHousing;

/* Part 2 - Populate Property Address Data */

-- List the rows whose PropertyAddress is missing.
SELECT nh.PropertyAddress
FROM Practice.dbo.NashvilleHousing AS nh
WHERE nh.PropertyAddress IS NULL;

/* Checking to see if we can populate NULL values through ParcelID.


Performing a self-join, using aliases to compare values between two instances of the same
table.
Using UniqueID to filter out rows where UniqueID is the same between the two instances
(basically getting only distinct rows in the results).
Using ISNULL to ensure that PropertyAddress takes its value from either alias a or b, to
get rid of NULL values.
Lastly, using UPDATE to effectively change the table.
*/

-- Browse every row ordered by ParcelID, so rows with the same ParcelID are adjacent.
SELECT *
FROM Practice.dbo.NashvilleHousing
ORDER BY ParcelID;
-- Preview: for each row with a missing address (alias a), show another row of
-- the same parcel (alias b) and the address that would be copied over.
-- b.PropertyAddress IS NOT NULL keeps only donor rows that actually have an
-- address; without it a NULL donor can be paired with the row.
SELECT
    a.ParcelID,
    a.PropertyAddress,
    b.ParcelID,
    b.PropertyAddress,
    ISNULL(a.PropertyAddress, b.PropertyAddress) AS PropertyAddressFilled
FROM Practice.dbo.NashvilleHousing AS a
INNER JOIN Practice.dbo.NashvilleHousing AS b
    ON a.ParcelID = b.ParcelID
    AND a.[UniqueID] <> b.[UniqueID]
WHERE a.PropertyAddress IS NULL
    AND b.PropertyAddress IS NOT NULL;

-- Apply the fill. When several rows match, UPDATE ... FROM uses one of them
-- arbitrarily, so the donor filter is what guarantees a non-NULL result.
UPDATE a
SET PropertyAddress = ISNULL(a.PropertyAddress, b.PropertyAddress)
FROM Practice.dbo.NashvilleHousing AS a
INNER JOIN Practice.dbo.NashvilleHousing AS b
    ON a.ParcelID = b.ParcelID
    AND a.[UniqueID] <> b.[UniqueID]
WHERE a.PropertyAddress IS NULL
    AND b.PropertyAddress IS NOT NULL;

/* Part 3 - Separating Address attributes into individual Columns (Address, City, State) */

SELECT PropertyAddress
FROM Practice.dbo.NashvilleHousing;

-- Preview of the split at the first comma.
-- CHARINDEX searches PropertyAddress + ',' so a comma is always found: for an
-- address without a comma, the plain CHARINDEX(...) - 1 would be -1 and
-- SUBSTRING would raise "Invalid length parameter". In that case the whole
-- value lands in Address and City becomes NULL (via NULLIF on the empty string).
-- LTRIM removes the space that follows the comma.
SELECT
    SUBSTRING(
        PropertyAddress,
        1,
        CHARINDEX(',', PropertyAddress + ',') - 1
    ) AS Address,
    NULLIF(
        LTRIM(
            SUBSTRING(
                PropertyAddress,
                CHARINDEX(',', PropertyAddress + ',') + 1,
                LEN(PropertyAddress)
            )
        ),
        ''
    ) AS City
FROM Practice.dbo.NashvilleHousing;

/* 1st Substring starts from position 1 all the way to ",", basically specifying that I want everything
until the comma but not the comma (thus the -1).
2nd Substring starts from the "," all the way to the end, but because addresses may have more
or fewer characters, LEN(PropertyAddress) is used to handle that.
The "+1" also indicates to start after the ",".
*/

-- Adding the 2 new columns to the table with ALTER TABLE, then filling them with UPDATE

-- The COL_LENGTH guards skip an ADD when the column already exists, so the
-- script can be re-run.
IF COL_LENGTH('Practice.dbo.NashvilleHousing', 'PropertySplitAddress') IS NULL
    ALTER TABLE Practice.dbo.NashvilleHousing
    ADD PropertySplitAddress NVARCHAR(255);

IF COL_LENGTH('Practice.dbo.NashvilleHousing', 'PropertySplitCity') IS NULL
    ALTER TABLE Practice.dbo.NashvilleHousing
    ADD PropertySplitCity NVARCHAR(255);
GO
-- GO (the SSMS / sqlcmd batch separator) ends the batch here on purpose:
-- a batch is compiled as a whole, so an UPDATE that references columns
-- added earlier in the same batch fails with "Invalid column name".

-- Split PropertyAddress at the first comma.
-- CHARINDEX searches PropertyAddress + ',' so an address without a comma does
-- not produce a negative SUBSTRING length; such a row keeps its whole value in
-- PropertySplitAddress and gets NULL in PropertySplitCity.
-- LTRIM removes the space that follows the comma.
UPDATE Practice.dbo.NashvilleHousing
SET
    PropertySplitAddress = SUBSTRING(
        PropertyAddress,
        1,
        CHARINDEX(',', PropertyAddress + ',') - 1
    ),
    PropertySplitCity = NULLIF(
        LTRIM(
            SUBSTRING(
                PropertyAddress,
                CHARINDEX(',', PropertyAddress + ',') + 1,
                LEN(PropertyAddress)
            )
        ),
        ''
    );

-- Check the result.
SELECT *
FROM Practice.dbo.NashvilleHousing;

-- Side note: the split could also be done with PARSENAME, followed by ALTER and UPDATE like before.
-- PARSENAME splits on '.', so the commas are replaced with periods first.
-- Parts are numbered from the right: 1 is the last part (State), 3 the first (Address).
-- LTRIM removes the space that follows each comma.
-- NOTE(review): an OwnerAddress that already contains a period would shift the
-- parts or yield NULL; verify against the data before relying on this.
SELECT
    PARSENAME(REPLACE(OwnerAddress, ',', '.'), 3) AS [Address],
    LTRIM(PARSENAME(REPLACE(OwnerAddress, ',', '.'), 2)) AS [City],
    LTRIM(PARSENAME(REPLACE(OwnerAddress, ',', '.'), 1)) AS [State]
FROM Practice.dbo.NashvilleHousing;

-- Change column SoldAsVacant from "Y" to "Yes" and "N" to "No"

-- Distribution of the current values, least frequent first.
-- GROUP BY already yields one row per value, so DISTINCT is not needed.
SELECT
    SoldAsVacant,
    COUNT(SoldAsVacant) AS [COUNT]
FROM Practice.dbo.NashvilleHousing
GROUP BY SoldAsVacant
ORDER BY COUNT(SoldAsVacant);

-- Preview: the stored value next to its normalized form.
SELECT
    SoldAsVacant,
    CASE
        WHEN SoldAsVacant = 'Y' THEN 'Yes'
        WHEN SoldAsVacant = 'N' THEN 'No'
        ELSE SoldAsVacant
    END AS SoldAsVacantNormalized
FROM Practice.dbo.NashvilleHousing;

-- Apply the normalization. Only 'Y' and 'N' rows change, so the WHERE clause
-- limits the write to those rows instead of rewriting the whole table.
-- The terminating semicolon matters: the next statement in this script starts
-- with WITH, which requires the preceding statement to be terminated.
UPDATE Practice.dbo.NashvilleHousing
SET SoldAsVacant = CASE
        WHEN SoldAsVacant = 'Y' THEN 'Yes'
        WHEN SoldAsVacant = 'N' THEN 'No'
        ELSE SoldAsVacant
    END
WHERE SoldAsVacant IN ('Y', 'N');

/*Remove Duplicates for the whole spreadsheet (in this case I will
actually delete the data without creating a
backup since this is just a training set)*/
-- The CTE (common table expression) is going to be used to define the query block, so it can be
-- used again

/* This allows me to check the duplicates

WITH RowNumCTE AS(


SELECT*,
ROW_NUMBER()OVER(
PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID) row_num
FROM Practice.dbo.NashvilleHousing
)
SELECT *
FROM RowNumCTE
WHERE row_num > 1
ORDER BY PropertyAddress
*/

-- Deleting duplicates
-- Rows count as duplicates when they share ParcelID, PropertyAddress, SalePrice,
-- SaleDate and LegalReference. Within each such group the rows are numbered by
-- UniqueID; the first one (row_num = 1) is kept and the others are deleted.
-- The leading semicolon terminates whatever statement comes before: T-SQL
-- requires that when WITH starts a common table expression.
;WITH RowNumCTE AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY
                ParcelID,
                PropertyAddress,
                SalePrice,
                SaleDate,
                LegalReference
            ORDER BY UniqueID
        ) AS row_num
    FROM Practice.dbo.NashvilleHousing
)
DELETE
FROM RowNumCTE
WHERE row_num > 1;

-- Delete unused columns (acceptable here because this dataset is only used for training)

-- Look at the table once more before the columns are removed.
SELECT *
FROM Practice.dbo.NashvilleHousing;

ALTER TABLE Practice.dbo.NashvilleHousing
DROP COLUMN
    OwnerAddress,
    TaxDistrict,
    PropertyAddress,
    SaleDate;

-- You might also like