// file kernel/n/x86/karatsuba.S: Karatsuba multiplication
/*-----------------------------------------------------------------------+
 |  Copyright 2005, Michel Quercia (michel.quercia@prepas.org)           |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                     Multiplication de Karatsuba                       |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                            # +------------------+
                            # |  Multiplication  |
                            # +------------------+


# Internal register-based entry point of the Karatsuba multiplication.
#
# input :
#   a = natural number of length la     esi = &a, edx = la
#   b = natural number of length lb     ebx = &b, ecx = lb
#   c = natural number of length la+lb  edi = &c
# constraints : 0 < lb <= la
#
# output :
#   c <- a * b
#
# modified registers :
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined

#ifdef assembly_sn_karamul
        ALIGN(32)
#ifdef debug_karamul
.Lsn_fkaramul_buggy:
#else
.Lsn_fkaramul:
#endif

#undef L
#define L(x) .Lsn_fkaramul_##x

        # small multiplication => quadratic (n^2) algorithm
        cmpl   $karamul_lim, %ecx
        jbe   .Lsn_fmul_n2

        # set up the local variables. The offsets below are valid once the
        # seven pushes further down are done : c, b, a, r, q, p, x lie at the
        # top of the stack and the buffer d (2p digits) sits right above them
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #undef  _q_
        #undef  _r_
        #undef  _x_
        #define _d_ 28(%esp)
        #define _x_ 24(%esp)
        #define _p_ 20(%esp)
        #define _q_ 16(%esp)
        #define _r_ 12(%esp)
        #define _a_  8(%esp)
        #define _b_  4(%esp)
        #define _c_   (%esp)
        
        movl   %edx,    %eax
        shrl   $1,      %eax
        adcl   $0,      %eax            # eax <- p = ceil(la/2)
        subl   %eax,    %ecx            # ecx <- r = lb - p
        jbe    L(tranches)              # lb <= p : cut a into slices instead
        subl   %eax,    %edx            # edx <- q = la - p
        leal   (,%eax,8), %ebp
        subl   %ebp,    %esp            # reserve 2p digits (8p bytes) on the stack
        pushl  $0                       # x <- 0
        pushl  %eax                     # save p
        pushl  %edx                     # save q
        pushl  %ecx                     # save r
        pushl  %esi                     # save &a
        pushl  %ebx                     # save &b
        pushl  %edi                     # save &c

        # compute |a0 - a1| and |b0 - b1| into c
        # NOTE(review): the two adcl below assume that .Lsn_fasub returns the
        # sign of the difference in CF and ecx = 0 - confirm against its definition
        movl   %edx,    %ecx            # ecx <- q
        movl   %eax,    %edx            # edx <- p
        leal   (%esi,%edx,4), %ebx      # ebx <- &a1
        call   .Lsn_fasub               # c <- |a0 - a1| (read back below as p digits)
        adcl   %ecx,    _x_             # x <- sign(a0-a1)
        
        movl   _b_,     %esi
        movl   _c_,     %edi
        movl   _r_,     %ecx
        movl   _p_,     %edx            # edx <- p
        leal   (%esi,%edx,4), %ebx      # ebx <- &b1
        leal   (%edi,%edx,4), %edi      # edi <- &c[p]
        call   .Lsn_fasub               # c[p..2p-1] <- |b0 - b1|
        adcl   %ecx,    _x_             # x += sign(b0-b1) : bit 0 of x = xor of both signs

        # d <- |a0-a1|*|b0-b1|
        movl   _c_,     %esi
        movl   _p_,     %edx
        movl   %edx,    %ecx            # ecx <- p
        leal   (%esi,%edx,4), %ebx      # ebx <- &c[p]
        leal   _d_,     %edi
        call   .Lsn_fkaramul

        # c <- a1b1 : a0b0 (low half first : c[0..2p-1] <- a0*b0)
        movl   _a_,     %esi
        movl   _b_,     %ebx
        movl   _c_,     %edi
        movl   _p_,     %ecx
        movl   %ecx,    %edx            # edx <- p
        call   .Lsn_fkaramul
        
        # high half : c[2p..] <- a1*b1, operand lengths q and r
        movl    _c_,    %edi
        movl   _b_,     %ebx
        movl   _a_,     %esi
        movl   _r_,     %ecx
        movl   _q_,     %edx
        movl   _p_,     %eax
        leal   (%esi,%eax,4), %esi      # esi <- &a1
        leal   (%ebx,%eax,4), %ebx      # ebx <- &b1
        leal   (%edi,%eax,8), %edi      # edi <- &c1
        call   .Lsn_fkaramul

        # landing point shared with karasqr : recombination step.
        # On entry the frame (c, b, a, r, q, p, x, then d) is in place, c holds
        # a1b1 : a0b0 and d holds |a0-a1|*|b0-b1|. Bit 0 of x tells whether the
        # middle product must be added (bit set) or subtracted (bit clear).
        # The bit tests use the explicit 32-bit form btl : with an immediate
        # and a memory operand the assembler cannot deduce the operand size.
.Lsn_kara_aux:
#ifdef use_sse2

        # prepare the cross addition
        movl   _p_,     %eax
        movl   _q_,     %ecx
        addl   _r_,     %ecx
        subl   %eax,    %ecx            # ecx <- q+r-p
        movl   %ecx,    _q_             # stored in the slot of q
        movl   _c_,     %esi
        leal   (%esi,%ecx,4), %esi      # esi <- &c0[q+r-p]
        leal   (%esi,%eax,4), %edi      # edi <- &c1[q+r-p]
        leal   (%edi,%eax,4), %ebx      # ebx <- &c2[q+r-p]
        leal   (%ebx,%eax,4), %edx      # edx <- &c3[q+r-p]
        leal   _d_,     %ebp
        leal   (%ebp,%ecx,4), %ebp      # ebp <- &d0[q+r-p]
        leal   (%ebp,%eax,4), %eax      # eax <- &d1[q+r-p]
        btl    $0,      _x_             # CF <- bit 0 of x
        jc     L(negatif)

        # c += (a0b0 + a1b1 - |a0-a1|*|b0-b1|)*BASE^p
        pxor   %mm0,    %mm0            # clear both carries
        pxor   %mm1,    %mm1
        negl   %ecx
        jz     2f
        ALIGN(4)
1:
        movd   (%edi,%ecx,4), %mm2      # mm2 <- c1[i]
        movd   (%ebx,%ecx,4), %mm3      # mm3 <- c2[i]
        paddq  %mm3,    %mm2            # mm2 <- c1[i] + c2[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i]
        paddq  %mm2,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i]
        movd   (%esi,%ecx,4), %mm2      # mm2 <- c0[i]
        movd   (%edx,%ecx,4), %mm3      # mm3 <- c3[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i] + c0[i]
        paddq  %mm3,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i] + c3[i]
        movd   (%ebp,%ecx,4), %mm2      # mm2 <- d0[i]
        movd   (%eax,%ecx,4), %mm3      # mm3 <- d1[i]
        psubq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i] + c0[i] - d0[i]
        psubq  %mm3,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i] + c3[i] - d1[i]
        movd   %mm0,   (%edi,%ecx,4)    # update c1[i]
        movd   %mm1,   (%ebx,%ecx,4)    # update c2[i]
        incl   %ecx                     # sets ZF for the jne below (pshufw leaves flags alone)
        pshufw $0xfe,   %mm0, %mm0      # mm0 <- new carry for c1 (high dword moved down)
        pshufw $0xfe,   %mm1, %mm1      # mm1 <- new carry for c2
        jne    1b
2:

        # remaining 2p-q-r digits : c3 has no digit left, so it is not read
        movl   _p_,     %ecx
        subl   _q_,     %ecx            # ecx <- 2p-q-r
        jz     2f
        leal   (%esi,%ecx,4), %esi      # esi <- &c0[p]
        leal   (%edi,%ecx,4), %edi      # edi <- &c1[p]
        leal   (%ebx,%ecx,4), %ebx      # ebx <- &c2[p]
        leal   (%ebp,%ecx,4), %ebp      # ebp <- &d0[p]
        leal   (%eax,%ecx,4), %eax      # eax <- &d1[p]
        negl   %ecx
        ALIGN(4)
1:
        movd   (%edi,%ecx,4), %mm2      # mm2 <- c1[i]
        movd   (%ebx,%ecx,4), %mm3      # mm3 <- c2[i]
        paddq  %mm3,    %mm2            # mm2 <- c1[i] + c2[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i]
        paddq  %mm2,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i]
        movd   (%esi,%ecx,4), %mm2      # mm2 <- c0[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i] + c0[i]
        movd   (%ebp,%ecx,4), %mm2      # mm2 <- d0[i]
        movd   (%eax,%ecx,4), %mm3      # mm3 <- d1[i]
        psubq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i] + c0[i] - d0[i]
        psubq  %mm3,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i] - d1[i]
        movd   %mm0,   (%edi,%ecx,4)    # update c1[i]
        movd   %mm1,   (%ebx,%ecx,4)    # update c2[i]
        incl   %ecx
        pshufw $0xfe,   %mm0, %mm0      # mm0 <- new carry for c1
        pshufw $0xfe,   %mm1, %mm1      # mm1 <- new carry for c2
        jne    1b

2:
        movd   %mm0,    %edx            # edx <- carry into c2
        testl  %edx,    %edx
        jns    L(add_mm0)               # >= 0 : go add it
        movd   %mm1,    %edx            # edx <- carry into c3
        movl   _p_,     %ecx
        negl   %ecx
1:
        subl   $1,     (%ebx,%ecx,4)    # c2 <- c2 - 1
        jnb    L(add_mm1)
        incl   %ecx
        jne    1b
        decl   %edx                     # borrow ran through all of c2 : take it from the c3 carry
        jmp    L(add_mm1)
        
        # c += (a0b0 + a1b1 + |a0-a1|*|b0-b1|)*BASE^p
        ALIGN(4)
L(negatif):
        pxor   %mm0,    %mm0            # clear both carries
        pxor   %mm1,    %mm1
        negl   %ecx
        jz     2f
        ALIGN(4)
1:
        movd   (%edi,%ecx,4), %mm2      # mm2 <- c1[i]
        movd   (%ebx,%ecx,4), %mm3      # mm3 <- c2[i]
        paddq  %mm3,    %mm2            # mm2 <- c1[i] + c2[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i]
        paddq  %mm2,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i]
        movd   (%esi,%ecx,4), %mm2      # mm2 <- c0[i]
        movd   (%edx,%ecx,4), %mm3      # mm3 <- c3[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i] + c0[i]
        paddq  %mm3,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i] + c3[i]
        movd   (%ebp,%ecx,4), %mm2      # mm2 <- d0[i]
        movd   (%eax,%ecx,4), %mm3      # mm3 <- d1[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i] + c0[i] + d0[i]
        paddq  %mm3,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i] + c3[i] + d1[i]
        movd   %mm0,   (%edi,%ecx,4)    # update c1[i]
        movd   %mm1,   (%ebx,%ecx,4)    # update c2[i]
        incl   %ecx
        pshufw $0xfe,   %mm0, %mm0      # mm0 <- new carry for c1
        pshufw $0xfe,   %mm1, %mm1      # mm1 <- new carry for c2
        jne    1b
2:
        
        # remaining 2p-q-r digits, c3 not read
        movl   _p_,     %ecx
        subl   _q_,     %ecx            # ecx <- 2p-q-r
        jz     2f
        leal   (%esi,%ecx,4), %esi      # esi <- &c0[p]
        leal   (%edi,%ecx,4), %edi      # edi <- &c1[p]
        leal   (%ebx,%ecx,4), %ebx      # ebx <- &c2[p]
        leal   (%ebp,%ecx,4), %ebp      # ebp <- &d0[p]
        leal   (%eax,%ecx,4), %eax      # eax <- &d1[p]
        negl   %ecx
        ALIGN(4)
1:
        movd   (%edi,%ecx,4), %mm2      # mm2 <- c1[i]
        movd   (%ebx,%ecx,4), %mm3      # mm3 <- c2[i]
        paddq  %mm3,    %mm2            # mm2 <- c1[i] + c2[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i]
        paddq  %mm2,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i]
        movd   (%esi,%ecx,4), %mm2      # mm2 <- c0[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i] + c0[i]
        movd   (%ebp,%ecx,4), %mm2      # mm2 <- d0[i]
        movd   (%eax,%ecx,4), %mm3      # mm3 <- d1[i]
        paddq  %mm2,    %mm0            # mm0 <- r(c1) + c1[i] + c2[i] + c0[i] + d0[i]
        paddq  %mm3,    %mm1            # mm1 <- r(c2) + c1[i] + c2[i] + d1[i]
        movd   %mm0,   (%edi,%ecx,4)    # update c1[i]
        movd   %mm1,   (%ebx,%ecx,4)    # update c2[i]
        incl   %ecx
        pshufw $0xfe,   %mm0, %mm0      # mm0 <- new carry for c1
        pshufw $0xfe,   %mm1, %mm1      # mm1 <- new carry for c2
        jne    1b

2:
        movd   %mm0,    %edx            # edx <- carry into c2
L(add_mm0):
        movl   _p_,     %ecx
        negl   %ecx
        addl   %edx,   (%ebx,%ecx,4)    # c2 <- c2 + carry
        movd   %mm1,    %edx            # edx <- carry into c3 (movd keeps CF)
        jnc    L(add_mm1)
        incl   %ecx
1:
        incl   (%ebx,%ecx,4)
        jne    L(add_mm1)
        incl   %ecx
        jne    1b
        incl   %edx                     # carry ran through all of c2 : pass it on to c3
        
L(add_mm1):
        addl   %edx,   (%ebx)           # c3 <- c3 + carry
        jnc    L(done)
1:
        leal  4(%ebx),  %ebx
        incl  (%ebx)
        jz    1b

L(done):
        emms                            # reset the FPU state after the MMX code
        movl   %eax,    %esp            # release the frame : eax = &d1[p] = end of d
        ret

#else /* sse2 */
        
        movl   _r_,     %eax
        addl   %eax,    _q_             # q <- q+r
        
        # c += (a0b0 + a1b1)*BASE^p
        movl   _c_,     %ebx
        movl   _p_,     %ecx
        leal   (%ebx,%ecx,4), %ebx      # ebx <- &c[p]
        leal   (%ebx,%ecx,4), %esi      # esi <- &c[2p]
        call   .Lsn_finc_1              # c[2p..3p-1] += c[p..2p-1]
        rcll   $1,      _r_             # bit 0 of r <- carry
        
        movl   _c_,     %esi
        movl   _p_,     %ecx
        leal   (%esi,%ecx,4), %edi      # edi <- &c[p]
        call   .Lsn_fadd_1              # c[p..2p-1] <- c[0..p-1] + c[2p..3p-1]
        rcll   $1,      _r_             # bit 0 of r <- carry, the previous one moves to bit 1
        
        movl   %edi,    %esi            # esi <- &c[2p]
        movl   _q_,     %edx
        movl   %edx,    %ecx
        subl   _p_,     %ecx            # ecx <- q+r-p
        jz     L(short_c)
        call   .Lsn_finc                # c[2p..2p+q+r-1] += c[3p..2p+q+r-1]

        # propagate the first carry onto c[3p..2p+q+r-1]
        btl    $1,      _r_
        jnc    L(short_c)
        movl   _p_,     %edx
        subl   _q_,     %edx            # edx <- p - q - r
L(ret_1):
        incl   (%ebx,%edx,4)
        jnz    L(short_c)
        incl   %edx
        jnz    L(ret_1)
L(short_c):

        # propagate both carries onto c[2p..2p+q+r-1]
        # NOTE(review): the next instructions rely on ecx = 0 here. This holds
        # on the jz path above; after the call it presumes that .Lsn_finc
        # leaves ecx at 0 - confirm against its definition
        movl   %ecx,    %eax
        btl    $0,      _r_
        adcl   %ecx,    %eax            # eax <- carry saved in bit 0 of r
        subl   _q_,     %ecx            # ecx <- -(q+r)
        btl    $1,      _r_             # CF <- carry saved in bit 1 of r
        adcl   %eax,    (%ebx,%ecx,4)
        jnc    L(done_ret)
L(ret_2):
        incl   %ecx
        jz     L(done_ret)
        incl   (%ebx,%ecx,4)
        jz     L(ret_2)
L(done_ret):

        # c[p..2p+q+r-1] -= (a0 - a1)*(b0 - b1)
        leal   _d_,     %ebx
        movl   _c_,     %esi
        movl   _p_,     %ecx
        leal   (%esi,%ecx,4), %esi      # esi <- &c[p]
        movl   %ecx,    %edx
        addl   _q_,     %edx            # edx <- p+q+r
        leal   (,%ecx,2), %ecx          # ecx <- 2p
        btl    $0,      _x_             # CF <- bit 0 of x
        jc     L(negatif)

        # NOTE(review): both exits below release the frame with ebx, which
        # presumes that .Lsn_fdec and .Lsn_finc leave ebx just past the end
        # of d - confirm against their definitions
        call   .Lsn_fdec                # case (a0-a1)*(b0-b1) >= 0 : subtract d
        movl   %ebx,    %esp            # release the frame
        ret

L(negatif):
        call   .Lsn_finc                # case (a0-a1)*(b0-b1) < 0 : add d
        movl   %ebx,    %esp            # release the frame
        ret

#endif /* sse2 */
        
        # here lb <= ceil(la/2) : a is cut into slices of length lb and each
        # slice is multiplied by b, the partial products being accumulated in c
        ALIGN(4)
L(tranches):
        addl   %eax,    %ecx            # ecx <- lb (undo the subtraction of p)

        # The code below is copied word for word into toommul, with the
        # two calls to sn_fkaramul replaced by calls to sn_ftoommul.
        # Remember to propagate any update there !

        # local variables (offsets valid once the five pushes below are done;
        # d is a buffer of lb digits located right above them)
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _la_
        #undef  _lb_
        #define _d_  20(%esp)
        #define _la_ 16(%esp)
        #define _lb_ 12(%esp)
        #define _a_   8(%esp)
        #define _b_   4(%esp)
        #define _c_    (%esp)
        
        leal   (,%ecx,4), %eax
        subl   %eax,    %esp            # reserve lb digits on the stack
        pushl  %edx                     # save la
        pushl  %ecx                     # save lb

        # first multiplication : c <- a[0..(la % lb)-1]*b
        movl   %edx,    %eax
        movl   $0,      %edx            # edx:eax <- la
        divl   %ecx                     # edx <- la % lb
        testl  %edx,    %edx            # when la is a multiple of lb ...
        jnz    1f
        movl   %ecx,    %edx            # ... the first slice is a full one of lb digits
1:
        xchgl  %ebx,    %esi            # swap the arguments ...
        xchgl  %ecx,    %edx            # so that edx >= ecx
        leal   (%ebx,%ecx,4), %eax
        pushl  %eax                     # saved a <- &a[la % lb]
        pushl  %esi                     # save &b
        leal   (%edi,%ecx,4), %eax
        pushl  %eax                     # saved c <- &c[la % lb]
        subl   %ecx,    _la_            # la -= la % lb
        call   .Lsn_fkaramul

        # following multiplications, one slice of lb digits per iteration
        ALIGN(4)
L(loop):
        movl   _c_,     %esi
        leal   _d_,     %edi
        movl   _lb_,    %ecx
        cld;   rep movsl                # d <- c[0..lb-1]
        
        movl   _c_,     %edi
        movl   _b_,     %esi
        movl   _a_,     %ebx
        movl   _lb_,    %edx
        movl   %edx,    %ecx            # ecx <- lb
        call   .Lsn_fkaramul            # c[0..2lb-1] <- a[0..lb-1]*b

        movl   _c_,     %esi
        leal   _d_,     %ebx
        movl   _lb_,    %ecx
        leal   (,%ecx,2), %edx          # edx <- 2*lb
        call   .Lsn_finc                # c <- c + d

        movl   _lb_,    %eax
        leal   (,%eax,4), %ecx
        addl   %ecx,    _c_             # c+=lb
        addl   %ecx,    _a_             # a+=lb
        subl   %eax,    _la_            # la -= lb, ZF drives the loop
        jne    L(loop)

        # finished
        leal   20(%esp,%eax,4), %esp    # release the stack : 5 saved words + lb digits
        ret

                              # +---------------+
                              # |  C interface  |
                              # +---------------+

#  void xn(karamul)(chiffre *a, long la, chiffre *b, long lb, chiffre *c)
#
#  input :
#  a = natural number of length la
#  b = natural number of length lb
#  c = natural number of length la+lb, not overlapping a or b
#  constraints : 0 < lb <= la
#
#  output :
#  c <- a*b
#
#  The exported name is sn_karamul_buggy when debug_karamul is defined.
#  NOTE(review): ENTER, arg1..arg5 and RETURN_WITH_SP are macros defined
#  outside this file; presumably they set up and tear down the C frame -
#  confirm against their definitions.

#ifdef debug_karamul
ENTER(sn_karamul_buggy)
#else
ENTER(sn_karamul)
#endif

        # load the C arguments into the registers expected by the
        # internal routine
        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edx            # edx <- la
        movl   arg3,    %ebx            # ebx <- &b
        movl   arg4,    %ecx            # ecx <- lb
        movl   arg5,    %edi            # edi <- &c
#ifdef debug_karamul
        call   .Lsn_fkaramul_buggy      # perform the multiplication
#else
        call   .Lsn_fkaramul
#endif
        RETURN_WITH_SP
#endif /* assembly_sn_karamul */

        # case where the assembly version is disabled or being debugged :
        # sn_fkaramul forwards the register arguments to the C version

#if !defined(assembly_sn_karamul) || defined(debug_karamul)
        ALIGN(32)
.Lsn_fkaramul:

        # build the five-word C argument block (a, la, b, lb, c) in one go;
        # leal keeps the flags untouched, exactly like the push sequence
        leal   -20(%esp), %esp
        movl   %esi,    (%esp)          # argument 1 : &a
        movl   %edx,   4(%esp)          # argument 2 : la
        movl   %ebx,   8(%esp)          # argument 3 : &b
        movl   %ecx,  12(%esp)          # argument 4 : lb
        movl   %edi,  16(%esp)          # argument 5 : &c
        call   SUBR(sn_karamul)
        leal   20(%esp), %esp           # drop the argument block
        ret
        
#endif /* !defined(assembly_sn_karamul) || defined(debug_karamul) */


                                 # +---------+
                                 # | Square  |
                                 # +---------+

# Internal register-based entry point of the Karatsuba squaring.
#
# input :
#   a = natural number of length la     esi = &a, edx = la
#   c = natural number of length 2*la   edi = &c
# constraints : 0 < la
#
# output :
#   c <- a^2
#
# modified registers :
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined
#
# The routine builds the same stack frame as the multiplication and ends by
# jumping to .Lsn_kara_aux, which belongs to the karamul assembly code. It
# can therefore only be built together with that code; the guard below turns
# a missing label into an explicit preprocessing diagnostic.

#ifdef assembly_sn_karasqr
#ifndef assembly_sn_karamul
#error "assembly_sn_karasqr requires assembly_sn_karamul (shared tail .Lsn_kara_aux)"
#endif
        ALIGN(32)
#ifdef debug_karamul
.Lsn_fkarasqr_buggy:
#else
.Lsn_fkarasqr:
#endif

#undef L
#define L(x) .Lsn_fkarasqr_##x

        # small square => quadratic (n^2) algorithm
        cmpl   $karasqr_lim, %edx
        jbe   .Lsn_fsqr_n2
        
        # set up the local variables : same layout as in the multiplication,
        # since the recombination code is shared
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _p_
        #undef  _q_
        #undef  _r_
        #undef  _x_
        #define _d_ 28(%esp)
        #define _x_ 24(%esp)
        #define _p_ 20(%esp)
        #define _q_ 16(%esp)
        #define _r_ 12(%esp)
        #define _a_  8(%esp)
        #define _b_  4(%esp)
        #define _c_   (%esp)
        
        movl   %edx,    %eax
        shrl   $1,      %eax
        adcl   $0,      %eax            # eax <- p = ceil(la/2)
        subl   %eax,    %edx            # edx <- q = la - p
        leal   (,%eax,8), %ebp
        subl   %ebp,    %esp            # reserve 2p digits (8p bytes) on the stack
        pushl  $0                       # x <- 0 : the middle square is always subtracted
        pushl  %eax                     # save p
        pushl  %edx                     # save q
        pushl  %edx                     # save r (= q)
        pushl  %esi                     # save &a
        pushl  %esi                     # save &b (= &a)
        pushl  %edi                     # save &c

        # compute |a0 - a1| into c (the sign is not recorded, x stays 0)
        movl   %edx,    %ecx            # ecx <- q
        movl   %eax,    %edx            # edx <- p
        leal   (%esi,%edx,4), %ebx      # ebx <- &a1
        call   .Lsn_fasub               # c <- |a0 - a1| (read back below as p digits)
        
        # d <- (a0-a1)^2
        movl   _c_,     %esi
        movl   _p_,     %edx
        leal   _d_,     %edi
        call   .Lsn_fkarasqr

        # c <- a1^2 : a0^2 (low half first : c[0..2p-1] <- a0^2)
        movl   _a_,     %esi
        movl   _c_,     %edi
        movl   _p_,     %edx
        call   .Lsn_fkarasqr
        
        # high half : c[2p..] <- a1^2, operand length q
        movl    _c_,    %edi
        movl   _a_,     %esi
        movl   _q_,     %edx
        movl   _p_,     %eax
        leal   (%esi,%eax,4), %esi      # esi <- &a1
        leal   (%edi,%eax,8), %edi      # edi <- &c1
        call   .Lsn_fkarasqr

        jmp    .Lsn_kara_aux            # continue in karamul (recombination and return)
        
                              # +---------------+
                              # |  C interface  |
                              # +---------------+

#  void xn(karasqr)(chiffre *a, long la, chiffre *b)
#
#  input :
#  a = natural number of length la
#  b = natural number of length 2*la, not overlapping a
#  constraints : 0 < la
#
#  output :
#  b <- a^2
#
#  The exported name is sn_karasqr_buggy when debug_karamul is defined
#  (the same switch as for the multiplication).
#  NOTE(review): ENTER, arg1..arg3 and RETURN_WITH_SP are macros defined
#  outside this file; presumably they set up and tear down the C frame -
#  confirm against their definitions.

#ifdef debug_karamul
ENTER(sn_karasqr_buggy)
#else
ENTER(sn_karasqr)
#endif

        # load the C arguments into the registers expected by the
        # internal routine
        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edx            # edx <- la
        movl   arg3,    %edi            # edi <- &b
#ifdef debug_karamul
        call   .Lsn_fkarasqr_buggy      # compute the square
#else
        call   .Lsn_fkarasqr
#endif
        RETURN_WITH_SP
#endif /* assembly_sn_karasqr */

        # case where the assembly version is disabled or being debugged :
        # sn_fkarasqr forwards the register arguments to the C version
        
#if !defined(assembly_sn_karasqr) || defined(debug_karamul)
        ALIGN(32)
.Lsn_fkarasqr:

        # build the three-word C argument block (a, la, b) in one go;
        # leal keeps the flags untouched, exactly like the push sequence
        leal   -12(%esp), %esp
        movl   %esi,    (%esp)          # argument 1 : &a
        movl   %edx,   4(%esp)          # argument 2 : la
        movl   %edi,   8(%esp)          # argument 3 : &b (the result)
        call   SUBR(sn_karasqr)
        leal   12(%esp), %esp           # drop the argument block
        ret
        
#endif /* !defined(assembly_sn_karasqr) || defined(debug_karamul) */


