data science python data cleaning

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP











up vote
2
down vote

favorite












I am preparing a dataset for a model, but somehow the code just doesn't run well.



The major error is:




File "/Users/liangjulia/Desktop/UW DS Certificate Learning Material/untitled6.py", line 61
'income2' = pd.to_numeric(Adult.income, errors='coerce')
^
SyntaxError: can't assign to literal



Code:



# import statement
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
# loading dataset, it is a combination of categorical and numerical data
hp = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header=0,sep=',')
hp.columns = ['age','workclass','income','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary-range']
# dataset basics
hp.head()
hp.shape
hp.dtypes

# account for all value '?'
hp.replace('?','na')
hp.isnull().sum()

# Remove obsolete data point in income
hp('income').dropna()

# replace all aberrant values
hp.replace('nan', 0)
hp.replace('NULL', 0)

# change data type of certain data point to numerical
number = LabelEncoder()
hp['income'] = number.fit_transform(hp['income'.astype('str')])
hp['capital-gain'] = number.fit_transform(hp['capital-gain'.astype('str')])
hp['capital-loss'] = number.fit_transform(hp['capital-loss'.astype('str')])

# Choose the datapoint 'income' to perform the data cleaning and remove outliers
LimitHi=np.mean('income') + 2*np.std('income')
LimitLo=np.mean('income') + 2*np.std('income')
BadIncome = ('income' > LimitHi) & ('income' < LimitLo)

# Replace outliers
RightIncome = ~BadIncome
x[BadIncome] = np.mean(x[RightIncome])

# normalize the Income Column using numpy
#'income2' = pd.to_numeric(Adult.income, errors='coerce')
minmaxscaled =('income' - min('income'))/(max('income') - min('income'))

# bin age data into several ranges
hp['bin'] = pd.cut(hp['age'], [15,30,45,60,75,90])

# construct new categorical data point with existing data point
hp['EvalonInvestment'] = 'zzz'
hp.loc[(hp['capital-gain'] >= 50000), 'loc2'] = 'investmentking'
hp.loc[(hp['capital-gain'] > 10000) & (hp['capital-gain'] < 50000), 'loc2'] = 'good-investment'
hp.loc[(hp['capital-gain'] > 0) & (hp['capital-gain'] <= 10000), 'loc2'] = 'ok-investment'

print(hp)






share|improve this question

























    up vote
    2
    down vote

    favorite












    I am preparing a dataset for a model, but somehow the code just doesn't run well.



    The major error is:




    File "/Users/liangjulia/Desktop/UW DS Certificate Learning Material/untitled6.py", line 61
    'income2' = pd.to_numeric(Adult.income, errors='coerce')
    ^
    SyntaxError: can't assign to literal



    Code:



    # import statement
    import numpy as np
    import pandas as pd
    import csv
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import LabelEncoder
    # loading dataset, it is a combination of categorical and numerical data
    hp = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header=0,sep=',')
    hp.columns = ['age','workclass','income','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary-range']
    # dataset basics
    hp.head()
    hp.shape
    hp.dtypes

    # account for all value '?'
    hp.replace('?','na')
    hp.isnull().sum()

    # Remove obsolete data point in income
    hp('income').dropna()

    # replace all aberrant values
    hp.replace('nan', 0)
    hp.replace('NULL', 0)

    # change data type of certain data point to numerical
    number = LabelEncoder()
    hp['income'] = number.fit_transform(hp['income'.astype('str')])
    hp['capital-gain'] = number.fit_transform(hp['capital-gain'.astype('str')])
    hp['capital-loss'] = number.fit_transform(hp['capital-loss'.astype('str')])

    # Choose the datapoint 'income' to perform the data cleaning and remove outliers
    LimitHi=np.mean('income') + 2*np.std('income')
    LimitLo=np.mean('income') + 2*np.std('income')
    BadIncome = ('income' > LimitHi) & ('income' < LimitLo)

    # Replace outliers
    RightIncome = ~BadIncome
    x[BadIncome] = np.mean(x[RightIncome])

    # normalize the Income Column using numpy
    #'income2' = pd.to_numeric(Adult.income, errors='coerce')
    minmaxscaled =('income' - min('income'))/(max('income') - min('income'))

    # bin age data into several ranges
    hp['bin'] = pd.cut(hp['age'], [15,30,45,60,75,90])

    # construct new categorical data point with existing data point
    hp['EvalonInvestment'] = 'zzz'
    hp.loc[(hp['capital-gain'] >= 50000), 'loc2'] = 'investmentking'
    hp.loc[(hp['capital-gain'] > 10000) & (hp['capital-gain'] < 50000), 'loc2'] = 'good-investment'
    hp.loc[(hp['capital-gain'] > 0) & (hp['capital-gain'] <= 10000), 'loc2'] = 'ok-investment'

    print(hp)






    share|improve this question























      up vote
      2
      down vote

      favorite









      up vote
      2
      down vote

      favorite











      I am preparing a dataset for a model, but somehow the code just doesn't run well.



      The major error is:




      File "/Users/liangjulia/Desktop/UW DS Certificate Learning Material/untitled6.py", line 61
      'income2' = pd.to_numeric(Adult.income, errors='coerce')
      ^
      SyntaxError: can't assign to literal



      Code:



      # import statement
      import numpy as np
      import pandas as pd
      import csv
      import matplotlib.pyplot as plt
      from sklearn.preprocessing import LabelEncoder
      # loading dataset, it is a combination of categorical and numerical data
      hp = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header=0,sep=',')
      hp.columns = ['age','workclass','income','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary-range']
      # dataset basics
      hp.head()
      hp.shape
      hp.dtypes

      # account for all value '?'
      hp.replace('?','na')
      hp.isnull().sum()

      # Remove obsolete data point in income
      hp('income').dropna()

      # replace all aberrant values
      hp.replace('nan', 0)
      hp.replace('NULL', 0)

      # change data type of certain data point to numerical
      number = LabelEncoder()
      hp['income'] = number.fit_transform(hp['income'.astype('str')])
      hp['capital-gain'] = number.fit_transform(hp['capital-gain'.astype('str')])
      hp['capital-loss'] = number.fit_transform(hp['capital-loss'.astype('str')])

      # Choose the datapoint 'income' to perform the data cleaning and remove outliers
      LimitHi=np.mean('income') + 2*np.std('income')
      LimitLo=np.mean('income') + 2*np.std('income')
      BadIncome = ('income' > LimitHi) & ('income' < LimitLo)

      # Replace outliers
      RightIncome = ~BadIncome
      x[BadIncome] = np.mean(x[RightIncome])

      # normalize the Income Column using numpy
      #'income2' = pd.to_numeric(Adult.income, errors='coerce')
      minmaxscaled =('income' - min('income'))/(max('income') - min('income'))

      # bin age data into several ranges
      hp['bin'] = pd.cut(hp['age'], [15,30,45,60,75,90])

      # construct new categorical data point with existing data point
      hp['EvalonInvestment'] = 'zzz'
      hp.loc[(hp['capital-gain'] >= 50000), 'loc2'] = 'investmentking'
      hp.loc[(hp['capital-gain'] > 10000) & (hp['capital-gain'] < 50000), 'loc2'] = 'good-investment'
      hp.loc[(hp['capital-gain'] > 0) & (hp['capital-gain'] <= 10000), 'loc2'] = 'ok-investment'

      print(hp)






      share|improve this question













      I am preparing a dataset for a model, but somehow the code just doesn't run well.



      The major error is:




      File "/Users/liangjulia/Desktop/UW DS Certificate Learning Material/untitled6.py", line 61
      'income2' = pd.to_numeric(Adult.income, errors='coerce')
      ^
      SyntaxError: can't assign to literal



      Code:



      # import statement
      import numpy as np
      import pandas as pd
      import csv
      import matplotlib.pyplot as plt
      from sklearn.preprocessing import LabelEncoder
      # loading dataset, it is a combination of categorical and numerical data
      hp = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header=0,sep=',')
      hp.columns = ['age','workclass','income','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary-range']
      # dataset basics
      hp.head()
      hp.shape
      hp.dtypes

      # account for all value '?'
      hp.replace('?','na')
      hp.isnull().sum()

      # Remove obsolete data point in income
      hp('income').dropna()

      # replace all aberrant values
      hp.replace('nan', 0)
      hp.replace('NULL', 0)

      # change data type of certain data point to numerical
      number = LabelEncoder()
      hp['income'] = number.fit_transform(hp['income'.astype('str')])
      hp['capital-gain'] = number.fit_transform(hp['capital-gain'.astype('str')])
      hp['capital-loss'] = number.fit_transform(hp['capital-loss'.astype('str')])

      # Choose the datapoint 'income' to perform the data cleaning and remove outliers
      LimitHi=np.mean('income') + 2*np.std('income')
      LimitLo=np.mean('income') + 2*np.std('income')
      BadIncome = ('income' > LimitHi) & ('income' < LimitLo)

      # Replace outliers
      RightIncome = ~BadIncome
      x[BadIncome] = np.mean(x[RightIncome])

      # normalize the Income Column using numpy
      #'income2' = pd.to_numeric(Adult.income, errors='coerce')
      minmaxscaled =('income' - min('income'))/(max('income') - min('income'))

      # bin age data into several ranges
      hp['bin'] = pd.cut(hp['age'], [15,30,45,60,75,90])

      # construct new categorical data point with existing data point
      hp['EvalonInvestment'] = 'zzz'
      hp.loc[(hp['capital-gain'] >= 50000), 'loc2'] = 'investmentking'
      hp.loc[(hp['capital-gain'] > 10000) & (hp['capital-gain'] < 50000), 'loc2'] = 'good-investment'
      hp.loc[(hp['capital-gain'] > 0) & (hp['capital-gain'] <= 10000), 'loc2'] = 'ok-investment'

      print(hp)








      share|improve this question












      share|improve this question




      share|improve this question








      edited 18 hours ago









      Stephen Rauch

      1,29341028




      1,29341028









      asked 18 hours ago









      user633599

      111




      111




















          2 Answers
          2






          active

          oldest

          votes

















          up vote
          2
          down vote













          It should be hp['income2'], because a string literal such as 'income2' cannot be the target of an assignment; you have to assign to a variable name or to a DataFrame column.






          share|improve this answer




























            up vote
            0
            down vote













            What line 61 should read is



            income2 = pd.to_numeric(hp['income'], errors = 'coerce')


            Let's break this down.



            You want to assign a new variable income2 to the output of a method in the pd class, to_numeric.



            to_numeric takes up to three arguments: arg (a list, tuple, 1-d array, or Series), a choice on how to treat errors, and an optional downcast option. You can find this out by running



            help(pd.to_numeric)


            Line 61 originally provided the arg Adult.income, which could be a list, tuple, 1-d array, or Series, but Adult hasn't been defined, so it can't be a valid arg.



            Try



            type(hp['income'])



            and you get



            pandas.core.series.Series


            which is a valid argument.



            This line is not the only problem in your code. I suggest that you use iPython so that you can see all of the other errors. Or just use the command line and copy in a few lines at a time and you'll find new error messages to resolve.



            Errors in code are like cockroaches -- they seldom travel alone.



            Good luck!






            share|improve this answer





















              Your Answer




              StackExchange.ifUsing("editor", function ()
              return StackExchange.using("mathjaxEditing", function ()
              StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
              StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["$", "$"], ["\\(","\\)"]]);
              );
              );
              , "mathjax-editing");

              StackExchange.ready(function()
              var channelOptions =
              tags: "".split(" "),
              id: "557"
              ;
              initTagRenderer("".split(" "), "".split(" "), channelOptions);

              StackExchange.using("externalEditor", function()
              // Have to fire editor after snippets, if snippets enabled
              if (StackExchange.settings.snippets.snippetsEnabled)
              StackExchange.using("snippets", function()
              createEditor();
              );

              else
              createEditor();

              );

              function createEditor()
              StackExchange.prepareEditor(
              heartbeatType: 'answer',
              convertImagesToLinks: false,
              noModals: false,
              showLowRepImageUploadWarning: true,
              reputationToPostImages: null,
              bindNavPrevention: true,
              postfix: "",
              noCode: true, onDemand: true,
              discardSelector: ".discard-answer"
              ,immediatelyShowMarkdownHelp:true
              );



              );








               

              draft saved


              draft discarded


















              StackExchange.ready(
              function ()
              StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fdatascience.stackexchange.com%2fquestions%2f36496%2fdata-science-python-data-cleaning%23new-answer', 'question_page');

              );

              Post as a guest






























              2 Answers
              2






              active

              oldest

              votes








              2 Answers
              2






              active

              oldest

              votes









              active

              oldest

              votes






              active

              oldest

              votes








              up vote
              2
              down vote













              It should be hp['income2'], because a string literal such as 'income2' cannot be the target of an assignment; you have to assign to a variable name or to a DataFrame column.






              share|improve this answer

























                up vote
                2
                down vote













                It should be hp['income2'], because a string literal such as 'income2' cannot be the target of an assignment; you have to assign to a variable name or to a DataFrame column.






                share|improve this answer























                  up vote
                  2
                  down vote










                  up vote
                  2
                  down vote









                  It should be hp['income2'], because a string literal such as 'income2' cannot be the target of an assignment; you have to assign to a variable name or to a DataFrame column.






                  share|improve this answer













                  It should be hp['income2'], because a string literal such as 'income2' cannot be the target of an assignment; you have to assign to a variable name or to a DataFrame column.







                  share|improve this answer













                  share|improve this answer



                  share|improve this answer











                  answered 16 hours ago









                  Omkaar.K

                  1406




                  1406




















                      up vote
                      0
                      down vote













                      What line 61 should read is



                      income2 = pd.to_numeric(hp['income'], errors = 'coerce')


                      Let's break this down.



                      You want to assign a new variable income2 to the output of a method in the pd class, to_numeric.



                      to_numeric takes up to three arguments: arg (a list, tuple, 1-d array, or Series), a choice on how to treat errors, and an optional downcast option. You can find this out by running



                      help(pd.to_numeric)


                      Line 61 originally provided the arg Adult.income, which could be a list, tuple, 1-d array, or Series, but Adult hasn't been defined, so it can't be a valid arg.



                      Try



                      type(hp['income'])



                      and you get



                      pandas.core.series.Series


                      which is a valid argument.



                      This line is not the only problem in your code. I suggest that you use iPython so that you can see all of the other errors. Or just use the command line and copy in a few lines at a time and you'll find new error messages to resolve.



                      Errors in code are like cockroaches -- they seldom travel alone.



                      Good luck!






                      share|improve this answer

























                        up vote
                        0
                        down vote













                        What line 61 should read is



                        income2 = pd.to_numeric(hp['income'], errors = 'coerce')


                        Let's break this down.



                        You want to assign a new variable income2 to the output of a method in the pd class, to_numeric.



                        to_numeric takes up to three arguments: arg (a list, tuple, 1-d array, or Series), a choice on how to treat errors, and an optional downcast option. You can find this out by running



                        help(pd.to_numeric)


                        Line 61 originally provided the arg Adult.income, which could be a list, tuple, 1-d array, or Series, but Adult hasn't been defined, so it can't be a valid arg.



                        Try



                        type(hp['income'])



                        and you get



                        pandas.core.series.Series


                        which is a valid argument.



                        This line is not the only problem in your code. I suggest that you use iPython so that you can see all of the other errors. Or just use the command line and copy in a few lines at a time and you'll find new error messages to resolve.



                        Errors in code are like cockroaches -- they seldom travel alone.



                        Good luck!






                        share|improve this answer























                          up vote
                          0
                          down vote










                          up vote
                          0
                          down vote









                          What line 61 should read is



                          income2 = pd.to_numeric(hp['income'], errors = 'coerce')


                          Let's break this down.



                          You want to assign a new variable income2 to the output of a method in the pd class, to_numeric.



                          to_numeric takes up to three arguments: arg (a list, tuple, 1-d array, or Series), a choice on how to treat errors, and an optional downcast option. You can find this out by running



                          help(pd.to_numeric)


                          Line 61 originally provided the arg Adult.income, which could be a list, tuple, 1-d array, or Series, but Adult hasn't been defined, so it can't be a valid arg.



                          Try



                          type(hp['income'])



                          and you get



                          pandas.core.series.Series


                          which is a valid argument.



                          This line is not the only problem in your code. I suggest that you use iPython so that you can see all of the other errors. Or just use the command line and copy in a few lines at a time and you'll find new error messages to resolve.



                          Errors in code are like cockroaches -- they seldom travel alone.



                          Good luck!






                          share|improve this answer













                          What line 61 should read is



                          income2 = pd.to_numeric(hp['income'], errors = 'coerce')


                          Let's break this down.



                          You want to assign a new variable income2 to the output of a method in the pd class, to_numeric.



                          to_numeric takes up to three arguments: arg (a list, tuple, 1-d array, or Series), a choice on how to treat errors, and an optional downcast option. You can find this out by running



                          help(pd.to_numeric)


                          Line 61 originally provided the arg Adult.income, which could be a list, tuple, 1-d array, or Series, but Adult hasn't been defined, so it can't be a valid arg.



                          Try



                          type(hp['income'])



                          and you get



                          pandas.core.series.Series


                          which is a valid argument.



                          This line is not the only problem in your code. I suggest that you use iPython so that you can see all of the other errors. Or just use the command line and copy in a few lines at a time and you'll find new error messages to resolve.



                          Errors in code are like cockroaches -- they seldom travel alone.



                          Good luck!







                          share|improve this answer













                          share|improve this answer



                          share|improve this answer











                          answered 14 hours ago









                          Richard Careaga

                          1213




                          1213






















                               

                              draft saved


                              draft discarded


























                               


                              draft saved


                              draft discarded














                              StackExchange.ready(
                              function ()
                              StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fdatascience.stackexchange.com%2fquestions%2f36496%2fdata-science-python-data-cleaning%23new-answer', 'question_page');

                              );

                              Post as a guest













































































                              Comments

                              Popular posts from this blog

                              What is the equation of a 3D cone with generalised tilt?

                              Color the edges and diagonals of a regular polygon

                              Relationship between determinant of matrix and determinant of adjoint?