list of columns : customer's first and last name customer's adress city / location of the store quantity price code_product category of products invoice_id payment method inoice_date
pip install Faker
Collecting Faker
Downloading Faker-18.13.0-py3-none-any.whl (1.7 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 9.5 MB/s eta 0:00:00:00:0100:01
Requirement already satisfied: python-dateutil>=2.4 in /srv/conda/envs/notebook/lib/python3.7/site-packages (from Faker) (2.8.2)
Requirement already satisfied: typing-extensions>=3.10.0.1 in /srv/conda/envs/notebook/lib/python3.7/site-packages (from Faker) (4.0.1)
Requirement already satisfied: six>=1.5 in /srv/conda/envs/notebook/lib/python3.7/site-packages (from python-dateutil>=2.4->Faker) (1.16.0)
Installing collected packages: Faker
Successfully installed Faker-18.13.0
Note: you may need to restart the kernel to use updated packages.
create fake names
import faker
from faker import Faker
fake = Faker()
fake.name()
set_of_names= []
for x in range(101):
name = fake.name()
set_of_names.append(name)
len(set_of_names)
101
create two different lists : first_name last_name
first_name = []
last_name = []
for name in set_of_names :
splited_name = name.split(" ")
first_name.append(splited_name[0])
last_name.append(splited_name[1])
create fake customer's adress
fake = Faker('fr_FR')
set_of_adress = set()
for _ in range(101):
adress = fake.address()
set_of_adress.add(adress)
create a list of city stores
import random
random.seed(123)
cities = ["paris", "lyon", "marseille", "nice", "toulouse"]
list_of_cities = random.choices(cities, k= 603)
len(list_of_cities)
603
customer's id
customer_id = []
for x in range (101):
customer_id.append(str(x)+""+"ABC")
quantity
list_of_quantity =[]
random.seed(123)
for x in range(603):
qty = random.randint(1,11)
list_of_quantity.append(qty)
len(list_of_quantity)
603
amount
list_of_price =[]
random.seed(123)
for x in range(201):
price= random.randint(5,300)
list_of_price.append(price)
categories
categories = [
"Electronics & Gadgets",
"Home & Kitchen",
"Health & Beauty",
"Clothing & Accessories",
"Toys & Games"
]
categories_code = [
"EG",
"HK",
"HB",
"CA",
"TG"
]
id_product
numbers = []
for x in range(603):
numbers.append(x)
for num in numbers :
# Convert to a list of strings
string_list = [str(num) for num in numbers]
50 product in every category
code_product= []
for code in categories_code:
for num in string_list:
code_product.append(code+" "+num)
import random
random.shuffle(code_product)
new = random.choices (code_product, k= 201)
len(new)
201
id product for product table
numbers2 = []
for y in range(201):
numbers2.append(y)
for num in numbers2 :
# Convert to a list of strings
string_list2 = [str(num) for num in numbers2]
product_id2= []
for code in categories_code:
for num2 in string_list2:
product_id2.append(code+" "+num2)
len(product_id2)
1005
code_pdt_cat = []
for x in product_id2 :
splited_id_pdt = x.split(" ")
code_pdt_cat.append( splited_id_pdt[0])
len(code_pdt_cat)
1005
invoice date
from faker import Faker
import datetime
fake = Faker()
random_date = []
#using string formats for dates that Faker cannot interpret correctly.
#The Faker library expects date inputs to be either datetime.date objects or formatted correctly.
start_date = datetime.date(2020, 1, 1)
end_date = datetime.date(2024, 12, 31)
for _ in range(1001):
random_date_between = fake.date_between(start_date=start_date, end_date=end_date)
# Convert the date to a string in 'YYYY-MM-DD' format
random_date.append(random_date_between.strftime('%Y-%m-%d'))
invoice_id
id_invoice= []
for x in range(201) :
id_invoice.append("F"+"0"+str(x))
create five tables and assign values
#customer
#categories
#invoice
#invoice_item
#product
#create data frame for customer
import pandas as pd
index_names = []
for i in range(101):
index_names.append(i)
customer = pd.DataFrame(columns=['First Name', 'Last Name', 'address','c_id'], index = index_names)
customer['First Name'] = first_name
customer['Last Name'] = last_name
customer['address'] = list(set_of_adress)
customer['c_id']= customer_id
customer
| First Name | Last Name | address | c_id | |
|---|---|---|---|---|
| 0 | Carolyn | Washington | 58, rue de Olivier\n30416 Legrand | 0ABC |
| 1 | Cassandra | Herman | 2, boulevard Maurice Ruiz\n58945 Bergernec | 1ABC |
| 2 | Michael | Conway | 5, avenue Monique Vidal\n61494 Sainte Dianedan | 2ABC |
| 3 | Angela | Sloan | 934, rue de Neveu\n97237 Le Gallnec | 3ABC |
| 4 | Michael | Strickland | 197, chemin Gomez\n40186 Baron-sur-Mer | 4ABC |
| ... | ... | ... | ... | ... |
| 96 | Seth | Perkins | 60, rue de Leleu\n75174 Sainte AgatheVille | 96ABC |
| 97 | Margaret | Boone | 77, rue de Philippe\n63291 Martineau | 97ABC |
| 98 | Dawn | Durham | 97, rue de Petitjean\n68378 Sainte JulienVille | 98ABC |
| 99 | Haley | Rodriguez | 87, rue Albert\n01783 Gosselin-sur-Grondin | 99ABC |
| 100 | Jessica | Jackson | 948, boulevard de Blanchet\n75260 Reynaud-sur-... | 100ABC |
101 rows × 4 columns
len(categories_code)
5
#create data frame for categories
index_names = []
for i in range(5):
index_names.append(i)
categories = pd.DataFrame(columns=['id_category' , 'the category'],index = index_names)
categories['id_category'] = categories_code
categories['the category']= [
"Electronics & Gadgets",
"Home & Kitchen",
"Health & Beauty",
"Clothing & Accessories",
"Toys & Games"
]
categories
| id_category | the category | |
|---|---|---|
| 0 | EG | Electronics & Gadgets |
| 1 | HK | Home & Kitchen |
| 2 | HB | Health & Beauty |
| 3 | CA | Clothing & Accessories |
| 4 | TG | Toys & Games |
rep_customer_id = random.choices(customer_id , k = 201)
rep_dates = random.choices(random_date, k = 201)
#create data frame for invoice
index_names = []
for i in range(201):
index_names.append(i)
invoice = pd.DataFrame(columns=['invoice_id','customer_id', 'invoice_date', 'amount' ],index = index_names)
invoice['invoice_id'] = id_invoice
invoice['invoice_date']= rep_dates
invoice['customer_id']= rep_customer_id
invoice['amount']= list_of_price
invoice
| invoice_id | customer_id | invoice_date | amount | |
|---|---|---|---|---|
| 0 | F00 | 2ABC | 2024-05-08 | 31 |
| 1 | F01 | 43ABC | 2020-07-03 | 142 |
| 2 | F02 | 94ABC | 2023-12-01 | 49 |
| 3 | F03 | 23ABC | 2024-05-01 | 213 |
| 4 | F04 | 70ABC | 2023-12-20 | 141 |
| ... | ... | ... | ... | ... |
| 196 | F0196 | 52ABC | 2023-06-29 | 142 |
| 197 | F0197 | 85ABC | 2023-08-04 | 294 |
| 198 | F0198 | 82ABC | 2020-11-11 | 123 |
| 199 | F0199 | 57ABC | 2023-05-27 | 48 |
| 200 | F0200 | 64ABC | 2023-10-02 | 167 |
201 rows × 4 columns
index_names = []
for i in range(1005):
index_names.append(i)
product = pd.DataFrame(columns=[ 'category_product', 'pdt_id'], index= index_names)
product ['pdt_id'] = product_id2
product ['category_product'] =code_pdt_cat
product
| category_product | pdt_id | |
|---|---|---|
| 0 | EG | EG 0 |
| 1 | EG | EG 1 |
| 2 | EG | EG 2 |
| 3 | EG | EG 3 |
| 4 | EG | EG 4 |
| ... | ... | ... |
| 1000 | TG | TG 196 |
| 1001 | TG | TG 197 |
| 1002 | TG | TG 198 |
| 1003 | TG | TG 199 |
| 1004 | TG | TG 200 |
1005 rows × 2 columns
rep_invoice_id = random.choices(id_invoice , k = 603)
rep_price_id = random.choices(new , k = 603)
#invoice_item_id sql
index_names = []
for i in range(603):
index_names.append(i)
invoice_item = pd.DataFrame(columns=[ 'invoice_id' , 'qty', 'product_id', 'city' ], index= index_names)
invoice_item ['invoice_id']= rep_invoice_id
invoice_item ['qty']= list_of_quantity
invoice_item ['product_id']= rep_price_id
invoice_item ['city']= list_of_cities
invoice_item
| invoice_id | qty | product_id | city | |
|---|---|---|---|---|
| 0 | F087 | 1 | HB 596 | paris |
| 1 | F0104 | 5 | EG 489 | paris |
| 2 | F052 | 2 | HK 441 | marseille |
| 3 | F041 | 7 | HK 230 | paris |
| 4 | F0130 | 5 | CA 235 | toulouse |
| ... | ... | ... | ... | ... |
| 598 | F015 | 2 | CA 269 | marseille |
| 599 | F073 | 11 | HK 528 | paris |
| 600 | F0167 | 9 | EG 157 | paris |
| 601 | F00 | 2 | TG 358 | lyon |
| 602 | F0194 | 6 | TG 435 | nice |
603 rows × 4 columns
#transform files to csv
customer.to_csv('customers.csv', index=False)
invoice.to_csv('invoice.csv', index=False)
invoice_item.to_csv("invoice_item.csv",index=False)
categories.to_csv("category.csv",index=False)
product.to_csv("product.csv",index=False)