list of columns : customer's first and last name customer's adress city / location of the store quantity price code_product category of products invoice_id payment method inoice_date

In [4]:
pip install Faker 
Collecting Faker
  Downloading Faker-18.13.0-py3-none-any.whl (1.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 9.5 MB/s eta 0:00:00:00:0100:01
Requirement already satisfied: python-dateutil>=2.4 in /srv/conda/envs/notebook/lib/python3.7/site-packages (from Faker) (2.8.2)
Requirement already satisfied: typing-extensions>=3.10.0.1 in /srv/conda/envs/notebook/lib/python3.7/site-packages (from Faker) (4.0.1)
Requirement already satisfied: six>=1.5 in /srv/conda/envs/notebook/lib/python3.7/site-packages (from python-dateutil>=2.4->Faker) (1.16.0)
Installing collected packages: Faker
Successfully installed Faker-18.13.0
Note: you may need to restart the kernel to use updated packages.

create fake names

In [5]:
import faker
from faker import Faker
fake = Faker()

fake.name()
set_of_names= []
for x in range(101):
    name = fake.name()
    set_of_names.append(name)
len(set_of_names)
Out[5]:
101

create two different lists : first_name last_name

In [6]:
first_name = []
last_name = []
for name in set_of_names : 
    splited_name = name.split(" ")
    first_name.append(splited_name[0])
    last_name.append(splited_name[1])

create fake customer's adress

In [7]:
fake = Faker('fr_FR')
set_of_adress = set()
for _ in range(101):
    adress = fake.address()
    set_of_adress.add(adress)

    

create a list of city stores

In [8]:
import random
random.seed(123)
cities = ["paris", "lyon", "marseille", "nice", "toulouse"]
list_of_cities = random.choices(cities, k= 603)
len(list_of_cities)
Out[8]:
603

customer's id

In [9]:
customer_id = []
for x in range (101):
    customer_id.append(str(x)+""+"ABC")

quantity

In [10]:
list_of_quantity =[]
random.seed(123)
for x in range(603): 
    qty = random.randint(1,11)
    list_of_quantity.append(qty)
len(list_of_quantity)
    
Out[10]:
603

amount

In [11]:
list_of_price =[]
random.seed(123)
for x in range(201): 
    price= random.randint(5,300)
    list_of_price.append(price)

categories

In [41]:
categories = [
    "Electronics & Gadgets",
    "Home & Kitchen",
    "Health & Beauty",
    "Clothing & Accessories",
    "Toys & Games"
]

categories_code = [
    "EG",
    "HK",
    "HB",
    "CA",
    "TG"
]

id_product

In [14]:
numbers = []

for x in range(603): 
    numbers.append(x)
    

for num in numbers :
# Convert to a list of strings
    string_list = [str(num) for num in numbers]

50 product in every category

In [15]:
code_product= []
for code in categories_code:
    for num in string_list:
        code_product.append(code+" "+num)

import random 
random.shuffle(code_product)

new = random.choices (code_product, k= 201)
len(new)
Out[15]:
201

id product for product table

In [16]:
numbers2 = []

for y in range(201): 
    numbers2.append(y)
    

for num in numbers2 :
# Convert to a list of strings
    string_list2 = [str(num) for num in numbers2]

product_id2= []
for code in categories_code:
    for num2 in string_list2:
        product_id2.append(code+" "+num2)   
len(product_id2)
Out[16]:
1005
In [17]:
code_pdt_cat = []

for x in product_id2 : 
    splited_id_pdt = x.split(" ") 
    code_pdt_cat.append( splited_id_pdt[0])
len(code_pdt_cat)
Out[17]:
1005

invoice date

In [18]:
from faker import Faker
import datetime

fake = Faker()
random_date = []

#using string formats for dates that Faker cannot interpret correctly. 
#The Faker library expects date inputs to be either datetime.date objects or formatted correctly.

start_date = datetime.date(2020, 1, 1)
end_date = datetime.date(2024, 12, 31)

for _ in range(1001):
    random_date_between = fake.date_between(start_date=start_date, end_date=end_date)
    # Convert the date to a string in 'YYYY-MM-DD' format
    random_date.append(random_date_between.strftime('%Y-%m-%d'))

  

invoice_id

In [19]:
id_invoice= []
for x in range(201) :
    id_invoice.append("F"+"0"+str(x))

create five tables and assign values

In [ ]:
#customer 
#categories
#invoice 
#invoice_item 
#product
In [20]:
#create data frame for customer 
import pandas as pd 
index_names = []
for i in range(101):
    index_names.append(i)
customer = pd.DataFrame(columns=['First Name', 'Last Name', 'address','c_id'], index = index_names)





customer['First Name'] = first_name 
customer['Last Name'] = last_name
customer['address'] = list(set_of_adress)
customer['c_id']= customer_id
In [21]:
customer
Out[21]:
First Name Last Name address c_id
0 Carolyn Washington 58, rue de Olivier\n30416 Legrand 0ABC
1 Cassandra Herman 2, boulevard Maurice Ruiz\n58945 Bergernec 1ABC
2 Michael Conway 5, avenue Monique Vidal\n61494 Sainte Dianedan 2ABC
3 Angela Sloan 934, rue de Neveu\n97237 Le Gallnec 3ABC
4 Michael Strickland 197, chemin Gomez\n40186 Baron-sur-Mer 4ABC
... ... ... ... ...
96 Seth Perkins 60, rue de Leleu\n75174 Sainte AgatheVille 96ABC
97 Margaret Boone 77, rue de Philippe\n63291 Martineau 97ABC
98 Dawn Durham 97, rue de Petitjean\n68378 Sainte JulienVille 98ABC
99 Haley Rodriguez 87, rue Albert\n01783 Gosselin-sur-Grondin 99ABC
100 Jessica Jackson 948, boulevard de Blanchet\n75260 Reynaud-sur-... 100ABC

101 rows × 4 columns

In [28]:
len(categories_code)
Out[28]:
5
In [51]:
#create data frame for categories
index_names = []
for i in range(5):
    index_names.append(i)

categories = pd.DataFrame(columns=['id_category' , 'the category'],index = index_names)


categories['id_category'] = categories_code

categories['the category']= [
    "Electronics & Gadgets",
    "Home & Kitchen",
    "Health & Beauty",
    "Clothing & Accessories",
    "Toys & Games"
]
In [52]:
categories
Out[52]:
id_category the category
0 EG Electronics & Gadgets
1 HK Home & Kitchen
2 HB Health & Beauty
3 CA Clothing & Accessories
4 TG Toys & Games
In [56]:
rep_customer_id = random.choices(customer_id , k = 201)
rep_dates = random.choices(random_date, k = 201)
In [59]:
#create data frame for invoice

index_names = []
for i in range(201):
    index_names.append(i)

invoice = pd.DataFrame(columns=['invoice_id','customer_id', 'invoice_date',  'amount' ],index = index_names)


invoice['invoice_id'] = id_invoice
invoice['invoice_date']= rep_dates
invoice['customer_id']= rep_customer_id
invoice['amount']= list_of_price
In [60]:
invoice
Out[60]:
invoice_id customer_id invoice_date amount
0 F00 2ABC 2024-05-08 31
1 F01 43ABC 2020-07-03 142
2 F02 94ABC 2023-12-01 49
3 F03 23ABC 2024-05-01 213
4 F04 70ABC 2023-12-20 141
... ... ... ... ...
196 F0196 52ABC 2023-06-29 142
197 F0197 85ABC 2023-08-04 294
198 F0198 82ABC 2020-11-11 123
199 F0199 57ABC 2023-05-27 48
200 F0200 64ABC 2023-10-02 167

201 rows × 4 columns

In [63]:
index_names = []
for i in range(1005):
    index_names.append(i)

product = pd.DataFrame(columns=[ 'category_product', 'pdt_id'], index= index_names) 
product ['pdt_id'] = product_id2
product ['category_product'] =code_pdt_cat

product
Out[63]:
category_product pdt_id
0 EG EG 0
1 EG EG 1
2 EG EG 2
3 EG EG 3
4 EG EG 4
... ... ...
1000 TG TG 196
1001 TG TG 197
1002 TG TG 198
1003 TG TG 199
1004 TG TG 200

1005 rows × 2 columns

In [64]:
rep_invoice_id = random.choices(id_invoice , k = 603)
rep_price_id = random.choices(new , k = 603)
In [65]:
#invoice_item_id sql 

index_names = []
for i in range(603):
    index_names.append(i)

invoice_item = pd.DataFrame(columns=[ 'invoice_id' , 'qty', 'product_id', 'city' ], index= index_names) 

invoice_item ['invoice_id']= rep_invoice_id
invoice_item ['qty']= list_of_quantity
invoice_item ['product_id']= rep_price_id
invoice_item ['city']= list_of_cities


invoice_item
Out[65]:
invoice_id qty product_id city
0 F087 1 HB 596 paris
1 F0104 5 EG 489 paris
2 F052 2 HK 441 marseille
3 F041 7 HK 230 paris
4 F0130 5 CA 235 toulouse
... ... ... ... ...
598 F015 2 CA 269 marseille
599 F073 11 HK 528 paris
600 F0167 9 EG 157 paris
601 F00 2 TG 358 lyon
602 F0194 6 TG 435 nice

603 rows × 4 columns

In [66]:
#transform files to csv  

customer.to_csv('customers.csv', index=False)
invoice.to_csv('invoice.csv', index=False)
invoice_item.to_csv("invoice_item.csv",index=False)
categories.to_csv("category.csv",index=False)
product.to_csv("product.csv",index=False)